From 926259c411c1022812ffb7fe88ca61f0180bd778 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Thu, 14 Dec 2017 09:51:09 +0800
Subject: [PATCH 001/450] TST: test case for string

---
 tensorflow/python/kernel_tests/scatter_nd_ops_test.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 9f57949515..83d69c651a 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -364,6 +364,16 @@ class ScatterNdTest(test.TestCase):
     del input_  # input_ is not used in scatter_nd
     return array_ops.scatter_nd(indices, updates, shape)
 
+  def testString(self):
+    indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant(["four", "three", "one", "seven"], dtype=dtypes.string)
+    expected = np.array(["", "one", "", "three", "four", "", "", "seven"])
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
+
+    with self.test_session() as sess:
+      result = sess.run(scatter)
+      self.assertTrue(np.array_equal(result, expected))
+
   def testRank3ValidShape(self):
     indices = array_ops.zeros([2, 2, 2], dtypes.int32)
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
-- 
GitLab


From 005840c6e2d2a4c25ecd293162a38a79dedf1a4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Thu, 14 Dec 2017 10:06:44 +0800
Subject: [PATCH 002/450] ENH: supports string for cpu

---
 tensorflow/core/kernels/scatter_nd_op.cc         | 1 +
 tensorflow/core/kernels/scatter_nd_op_cpu_impl.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 3a95dd1773..0caa7bd317 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -241,6 +241,7 @@ class ScatterNdUpdateOp : public OpKernel {
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_ADD_SUB_CPU);
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_UPDATE_CPU);
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_CPU);
+TF_CALL_string(REGISTER_SCATTER_ND_CPU);
 
 // Registers GPU kernels.
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index cffc326174..155d354d85 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -160,6 +160,7 @@ struct ScatterNdFunctor<CPUDevice, T, Index, OP, IXDIM> {
   REGISTER_SCATTER_ND_INDEX(type, scatter_nd_op::UpdateOp::SUB);
 
 TF_CALL_ALL_TYPES(REGISTER_SCATTER_ND_UPDATE);
+REGISTER_SCATTER_ND_INDEX(string, scatter_nd_op::UpdateOp::ADD);
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_MATH)
 
 #undef REGISTER_SCATTER_ND_MATH
-- 
GitLab


From d887d2bcfc819034b17e812a9a60460e2d61e447 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Thu, 14 Dec 2017 12:14:40 +0800
Subject: [PATCH 003/450] TST: ignore NonAliasingAdd

---
 tensorflow/python/kernel_tests/scatter_nd_ops_test.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 83d69c651a..03b2f892c6 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -594,6 +594,10 @@ class ScatterNdNonAliasingAddTest(ScatterNdTest):
         shape, dtype=updates.dtype))
     return array_ops.scatter_nd_non_aliasing_add(input_, indices, updates)
 
+  def testString(self):
+    # Not supported yet.
+    pass
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 4b697e0d9472215c706bdb36bb72986cdce78edd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Thu, 14 Dec 2017 13:51:34 +0800
Subject: [PATCH 004/450] DOC: modify document

---
 tensorflow/core/ops/array_ops.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 5a31f433ce..933ebe6b63 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -5332,12 +5332,13 @@ REGISTER_OP("ScatterNd")
     .Attr("Tindices: {int32, int64}")
     .SetShapeFn(ScatterNdShape)
     .Doc(R"doc(
-Scatter `updates` into a new (initially zero) tensor according to `indices`.
+Scatter `updates` into a new (initially zero for numeric, empty for string)
+tensor according to `indices`.
 
-Creates a new tensor by applying sparse `updates` to individual
-values or slices within a zero tensor of the given `shape` according to
-indices.  This operator is the inverse of the @{tf.gather_nd} operator which
-extracts values or slices from a given tensor.
+Creates a new tensor by applying sparse `updates` to individual values or
+slices within a zero (or empty string) tensor of the given `shape`
+according to indices. This operator is the inverse of the @{tf.gather_nd}
+operator which extracts values or slices from a given tensor.
 
 **WARNING**: The order in which updates are applied is nondeterministic, so the
 output will be nondeterministic if `indices` contains duplicates.
-- 
GitLab


From 597403e03680d69b72dbfa669f7bbdc77ce21ec9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Wed, 20 Dec 2017 16:34:48 +0800
Subject: [PATCH 005/450] CLN: conform docstring

---
 tensorflow/core/ops/array_ops.cc | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 933ebe6b63..89b6eb7162 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -5332,13 +5332,12 @@ REGISTER_OP("ScatterNd")
     .Attr("Tindices: {int32, int64}")
     .SetShapeFn(ScatterNdShape)
     .Doc(R"doc(
-Scatter `updates` into a new (initially zero for numeric, empty for string)
-tensor according to `indices`.
+Scatter `updates` into a new empty tensor according to `indices`.
 
 Creates a new tensor by applying sparse `updates` to individual values or
-slices within a zero (or empty string) tensor of the given `shape`
-according to indices. This operator is the inverse of the @{tf.gather_nd}
-operator which extracts values or slices from a given tensor.
+slices within a tensor (initially zero for numeric, empty for string) of
+the given `shape` according to indices. This operator is the inverse of the
+@{tf.gather_nd} operator which extracts values or slices from a given tensor.
 
 **WARNING**: The order in which updates are applied is nondeterministic, so the
 output will be nondeterministic if `indices` contains duplicates.
-- 
GitLab


From 9c272adf248228408448db6219b238145f5a02ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Fri, 16 Feb 2018 10:38:50 +0800
Subject: [PATCH 006/450] DOC: move doc to api def file

---
 .../core/api_def/base_api/api_def_ScatterNd.pbtxt      | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 4cb8c064fc..4e95895f54 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -25,12 +25,12 @@ A new tensor with the given shape and updates applied according
 to the indices.
 END
   }
-  summary: "Scatter `updates` into a new (initially zero) tensor according to `indices`."
+  summary: "Scatter `updates` into a new empty tensor according to `indices`."
   description: <<END
-Creates a new tensor by applying sparse `updates` to individual
-values or slices within a zero tensor of the given `shape` according to
-indices.  This operator is the inverse of the @{tf.gather_nd} operator which
-extracts values or slices from a given tensor.
+Creates a new tensor by applying sparse `updates` to individual values or
+slices within a tensor (initially zero for numeric, empty for string) of
+the given `shape` according to indices.  This operator is the inverse of the
+@{tf.gather_nd} operator which extracts values or slices from a given tensor.
 
 **WARNING**: The order in which updates are applied is nondeterministic, so the
 output will be nondeterministic if `indices` contains duplicates.
-- 
GitLab


From cee41f9d10b81ce3b49f566ddd448a7f3f2872c3 Mon Sep 17 00:00:00 2001
From: KB Sriram <kbsriram@gmail.com>
Date: Wed, 7 Mar 2018 08:11:03 -0800
Subject: [PATCH 007/450] C++ gradient for StridedSlice

See https://github.com/tensorflow/tensorflow/issues/9645
---
 tensorflow/cc/gradients/array_grad.cc      | 36 ++++++++++++++++++++++
 tensorflow/cc/gradients/array_grad_test.cc | 24 +++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc
index 6545e4ee3e..ff348fadb2 100644
--- a/tensorflow/cc/gradients/array_grad.cc
+++ b/tensorflow/cc/gradients/array_grad.cc
@@ -385,6 +385,42 @@ Status MirrorPadGradGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("MirrorPadGrad", MirrorPadGradGrad);
 
+Status StridedSliceGradHelper(const Scope& scope, const Operation& op,
+                              const std::vector<Output>& grad_inputs,
+                              std::vector<Output>* grad_outputs) {
+  Input x = Shape(scope, op.input(0));
+  Input begin = op.input(1);
+  Input end = op.input(2);
+  Input strides = op.input(3);
+  int64 begin_mask;
+  int64 end_mask;
+  int64 ellipsis_mask;
+  int64 new_axis_mask;
+  int64 shrink_axis_mask;
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "begin_mask", &begin_mask));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "end_mask", &end_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "ellipsis_mask", &ellipsis_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "new_axis_mask", &new_axis_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "shrink_axis_mask", &shrink_axis_mask));
+  grad_outputs->push_back(
+      StridedSliceGrad(scope, x, begin, end, strides, grad_inputs[0],
+                       StridedSliceGrad::BeginMask(begin_mask)
+                           .EndMask(end_mask)
+                           .EllipsisMask(ellipsis_mask)
+                           .NewAxisMask(new_axis_mask)
+                           .ShrinkAxisMask(shrink_axis_mask)));
+  // No gradients returned for begin, end and strides
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("StridedSlice", StridedSliceGradHelper);
+
 }  // anonymous namespace
 }  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc
index 4a215fcc92..2a2180297c 100644
--- a/tensorflow/cc/gradients/array_grad_test.cc
+++ b/tensorflow/cc/gradients/array_grad_test.cc
@@ -354,5 +354,29 @@ TEST_F(ArrayGradTest, MirrorPadGradGrad_Symmetric) {
   RunTest(x, x_shape, y, y_shape);
 }
 
+TEST_F(ArrayGradTest, StridedSliceGrad) {
+  TensorShape x_shape({6, 4, 4});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+
+  // y = x[2:6:2, 1:3, 1:3]
+  auto y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1});
+  // y.shape = [2, 2, 2];
+  RunTest(x, x_shape, y, {2, 2, 2});
+
+  // y = x[2:6:2, 1:3, 1:3]
+  // begin_mask = 1<<1 (ignore begin_index = 1)
+  // end_mask = 1<<2 (ignore end_index = 2)
+  y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1},
+                   StridedSlice::BeginMask(1<<1).EndMask(1<<2));
+  // y.shape = [2, 3, 3];
+  RunTest(x, x_shape, y, {2, 3, 3});
+
+  // y = [tf.newaxis, 2:6:2, 1:3, 1:3]
+  y = StridedSlice(scope_, x, {0, 2, 1, 1}, {0, 6, 3, 3}, {1, 2, 1, 1},
+                   StridedSlice::NewAxisMask(1<<0));
+  // y.shape = [1, 2, 2, 2];
+  RunTest(x, x_shape, y, {1, 2, 2, 2});
+}
+
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From e31fb25f4e3989a846a8e54d789a3bf5efff0cea Mon Sep 17 00:00:00 2001
From: KB Sriram <kbsriram@gmail.com>
Date: Thu, 8 Mar 2018 07:40:24 -0800
Subject: [PATCH 008/450] Clang-format fixes.

---
 tensorflow/cc/gradients/array_grad_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc
index 2a2180297c..de3bd0fc9e 100644
--- a/tensorflow/cc/gradients/array_grad_test.cc
+++ b/tensorflow/cc/gradients/array_grad_test.cc
@@ -367,13 +367,13 @@ TEST_F(ArrayGradTest, StridedSliceGrad) {
   // begin_mask = 1<<1 (ignore begin_index = 1)
   // end_mask = 1<<2 (ignore end_index = 2)
   y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1},
-                   StridedSlice::BeginMask(1<<1).EndMask(1<<2));
+                   StridedSlice::BeginMask(1 << 1).EndMask(1 << 2));
   // y.shape = [2, 3, 3];
   RunTest(x, x_shape, y, {2, 3, 3});
 
   // y = [tf.newaxis, 2:6:2, 1:3, 1:3]
   y = StridedSlice(scope_, x, {0, 2, 1, 1}, {0, 6, 3, 3}, {1, 2, 1, 1},
-                   StridedSlice::NewAxisMask(1<<0));
+                   StridedSlice::NewAxisMask(1 << 0));
   // y.shape = [1, 2, 2, 2];
   RunTest(x, x_shape, y, {1, 2, 2, 2});
 }
-- 
GitLab


From fe46c22a80b068b2b30f1e44f2f950ba6b6e907b Mon Sep 17 00:00:00 2001
From: Joe Yearsley <josephelliotyearsley@gmail.com>
Date: Fri, 9 Mar 2018 22:41:37 +0000
Subject: [PATCH 009/450] Update fold_old_batch_norms.cc

Fixes folding of fused batch normalization by this transform: the
generated BiasAdd node did not inherit the convolution's data_format.
The problem only shows up with 'NCHW', because the default is 'NHWC'.
---
 tensorflow/tools/graph_transforms/fold_old_batch_norms.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index d86f65325b..a5acd53ad6 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -159,6 +159,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
+  bias_add_op.attr["data_format"].CopyFrom(conv_node.attr["data_format"])
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
   AddNodeInput(conv_node.name(), &bias_add_node);
   AddNodeInput(bias_offset_node.name(), &bias_add_node);
-- 
GitLab


From 1ad788b136d509888cf7d484f762e31b2ee37a50 Mon Sep 17 00:00:00 2001
From: Joe Yearsley <josephelliotyearsley@gmail.com>
Date: Fri, 9 Mar 2018 22:46:30 +0000
Subject: [PATCH 010/450] Update fold_old_batch_norms.cc

---
 tensorflow/tools/graph_transforms/fold_old_batch_norms.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index a5acd53ad6..3376a81312 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -159,7 +159,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
-  bias_add_op.attr["data_format"].CopyFrom(conv_node.attr["data_format"])
+  bias_add_node.attr["data_format"].CopyFrom(conv_node.attr["data_format"])
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
   AddNodeInput(conv_node.name(), &bias_add_node);
   AddNodeInput(bias_offset_node.name(), &bias_add_node);
-- 
GitLab


From d0680917907671f5870818d21ee0ff77bf7c3ff6 Mon Sep 17 00:00:00 2001
From: Joe Yearsley <josephelliotyearsley@gmail.com>
Date: Fri, 9 Mar 2018 23:56:52 +0000
Subject: [PATCH 011/450] Update fold_old_batch_norms.cc

---
 tensorflow/tools/graph_transforms/fold_old_batch_norms.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index 3376a81312..59f3ffdcda 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -159,7 +159,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
-  bias_add_node.attr["data_format"].CopyFrom(conv_node.attr["data_format"])
+  CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
   AddNodeInput(conv_node.name(), &bias_add_node);
   AddNodeInput(bias_offset_node.name(), &bias_add_node);
-- 
GitLab


From 0c6845db28bd690eb848dde837f23fef6a0a8eed Mon Sep 17 00:00:00 2001
From: josephyearsley <joggino23@gmail.com>
Date: Sat, 31 Mar 2018 17:40:40 +0100
Subject: [PATCH 012/450] Copy data_format if the original node has that attr.

---
 tensorflow/tools/graph_transforms/fold_old_batch_norms.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index 59f3ffdcda..988ba25e36 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -159,7 +159,9 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
-  CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
+  if (HasAttr(conv_node, "data_format")) {
+    CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
+  }
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
   AddNodeInput(conv_node.name(), &bias_add_node);
   AddNodeInput(bias_offset_node.name(), &bias_add_node);
-- 
GitLab


From 5ca9fedc6b3f9619a3bcf7a5a4a523668055f57d Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Mon, 2 Apr 2018 13:02:01 +0800
Subject: [PATCH 013/450] Fix adam optimizer related math equation rendering
 format

---
 .../opt/python/training/lazy_adam_optimizer.py   |  6 +++---
 tensorflow/contrib/optimizer_v2/adam.py          | 16 ++++++++--------
 .../api_def/base_api/api_def_ApplyAdam.pbtxt     |  8 ++++----
 .../base_api/api_def_ResourceApplyAdam.pbtxt     |  8 ++++----
 tensorflow/python/training/adam.py               | 16 ++++++++--------
 5 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
index aeca900bc8..72117c1e81 100644
--- a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
@@ -56,21 +56,21 @@ class LazyAdamOptimizer(adam.AdamOptimizer):
     epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
     lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
 
-    # m := beta1 * m + (1 - beta1) * g_t
+    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
     m = self.get_slot(var, "m")
     m_t = state_ops.scatter_update(m, grad.indices,
                                    beta1_t * array_ops.gather(m, grad.indices) +
                                    (1 - beta1_t) * grad.values,
                                    use_locking=self._use_locking)
 
-    # v := beta2 * v + (1 - beta2) * (g_t * g_t)
+    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
     v = self.get_slot(var, "v")
     v_t = state_ops.scatter_update(v, grad.indices,
                                    beta2_t * array_ops.gather(v, grad.indices) +
                                    (1 - beta2_t) * math_ops.square(grad.values),
                                    use_locking=self._use_locking)
 
-    # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))
+    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
     m_t_slice = array_ops.gather(m_t, grad.indices)
     v_t_slice = array_ops.gather(v_t, grad.indices)
     denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 42b7f92a76..e863ca1244 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -41,21 +41,21 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
     Initialization:
 
     ```
-    m_0 <- 0 (Initialize initial 1st moment vector)
-    v_0 <- 0 (Initialize initial 2nd moment vector)
-    t <- 0 (Initialize timestep)
+    \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
+    \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
+    \\(t <- 0\\) (Initialize timestep)
     ```
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
     ```
-    t <- t + 1
-    lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+    $$t <- t + 1$$
+    $$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
 
-    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-    v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g
-    variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+    $$m_t <- beta1 * m_{t-1} + (1 - beta1) * g$$
+    $$v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g$$
+    $$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
     ```
 
     The default value of 1e-8 for epsilon might not be a good default in
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
index c2858a1bfb..9bffaa79f5 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
@@ -82,9 +82,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+$$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
+$$m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t$$
+$$v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t$$
+$$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
index bea1fd6762..109b68e472 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -76,9 +76,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+$$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
+$$m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t$$
+$$v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t$$
+$$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
 END
 }
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 006e360389..178eddc664 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -44,21 +44,21 @@ class AdamOptimizer(optimizer.Optimizer):
     Initialization:
 
     ```
-    m_0 <- 0 (Initialize initial 1st moment vector)
-    v_0 <- 0 (Initialize initial 2nd moment vector)
-    t <- 0 (Initialize timestep)
+    \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
+    \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
+    \\(t <- 0\\) (Initialize timestep)
     ```
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
     ```
-    t <- t + 1
-    lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+    $$t <- t + 1$$
+    $$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
 
-    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-    v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g
-    variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+    $$m_t <- beta1 * m_{t-1} + (1 - beta1) * g$$
+    $$v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g$$
+    $$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
     ```
 
     The default value of 1e-8 for epsilon might not be a good default in
-- 
GitLab


From 85763f5192bc772daf672b183ec63edef4e0047c Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Mon, 2 Apr 2018 13:11:26 +0800
Subject: [PATCH 014/450] Fix minor typo

---
 tensorflow/contrib/optimizer_v2/adam.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index e863ca1244..9bc160c0b9 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -51,11 +51,11 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
     ```
     $$t <- t + 1$$
-    $$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
+    $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t <- beta1 * m_{t-1} + (1 - beta1) * g$$
-    $$v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g$$
-    $$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
+    $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
     ```
 
     The default value of 1e-8 for epsilon might not be a good default in
-- 
GitLab


From 41074cd435a5d8b3831db8333b3669877b15a2c9 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Mon, 2 Apr 2018 13:14:48 +0800
Subject: [PATCH 015/450] Fix minor typo

---
 tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt | 8 ++++----
 tensorflow/python/training/adam.py                       | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
index 9bffaa79f5..fc2cb09471 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
@@ -82,9 +82,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
-$$m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t$$
-$$v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t$$
-$$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
+$$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 END
 }
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 178eddc664..1f2c40f18e 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -54,11 +54,11 @@ class AdamOptimizer(optimizer.Optimizer):
 
     ```
     $$t <- t + 1$$
-    $$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
+    $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t <- beta1 * m_{t-1} + (1 - beta1) * g$$
-    $$v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g$$
-    $$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
+    $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
     ```
 
     The default value of 1e-8 for epsilon might not be a good default in
-- 
GitLab


From aa7bb027a7cac837a3b774e9f443139b85c82aa8 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Tue, 3 Apr 2018 00:18:32 +0800
Subject: [PATCH 016/450] Fix minor typo

---
 .../api_def/base_api/api_def_ResourceApplyAdam.pbtxt     | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
index 109b68e472..5c60fa3aa1 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -76,9 +76,8 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
-$$m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t$$
-$$v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t$$
-$$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
-END
+$$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 }
-- 
GitLab


From 0b9eedd684b4085ab65d60627efa8594a92a0b98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sat, 7 Apr 2018 11:47:03 +0800
Subject: [PATCH 017/450] TST: add test case for duplicate indices

---
 .../kernel_tests/scatter_nd_ops_test.py       | 26 +++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 03b2f892c6..dfe9600dbb 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -366,13 +366,35 @@ class ScatterNdTest(test.TestCase):
 
   def testString(self):
     indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
-    updates = constant_op.constant(["four", "three", "one", "seven"], dtype=dtypes.string)
+    updates = constant_op.constant(["four", "three", "one", "seven"],
+                                   dtype=dtypes.string)
     expected = np.array(["", "one", "", "three", "four", "", "", "seven"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
+    with self.test_session() as sess:
+      result = sess.run(scatter)
+      self.assertAllEqual(expected, result)
 
+    # Same indice is updated twice by same value.
+    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant(["a", "b", "b", "c"],
+                                   dtype=dtypes.string)
+    expected = np.array(["", "", "", "bb", "a", "", "", "c"])
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
+    with self.test_session() as sess:
+      result = sess.run(scatter)
+      self.assertAllEqual(expected, result)
+
+    # Same indice is updated twice by different value.
+    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant(["a", "b", "c", "d"],
+                                   dtype=dtypes.string)
+    expected = [np.array(["", "", "", "bc", "a", "", "", "d"]),
+                np.array(["", "", "", "cb", "a", "", "", "d"])]
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.test_session() as sess:
       result = sess.run(scatter)
-      self.assertTrue(np.array_equal(result, expected))
+      self.assertTrue(np.array_equal(result, expected[0]) or
+                      np.array_equal(result, expected[1]))
 
   def testRank3ValidShape(self):
     indices = array_ops.zeros([2, 2, 2], dtypes.int32)
-- 
GitLab


From 9e1bbbc0fb770f077d9de295b53181e3592f1d24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sat, 7 Apr 2018 12:07:11 +0800
Subject: [PATCH 018/450] DOC: remove the misleading 'empty tensor'

---
 tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 4e95895f54..58753a651a 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -25,7 +25,7 @@ A new tensor with the given shape and updates applied according
 to the indices.
 END
   }
-  summary: "Scatter `updates` into a new empty tensor according to `indices`."
+  summary: "Scatter `updates` into a new tensor according to `indices`."
   description: <<END
 Creates a new tensor by applying sparse `updates` to individual values or
 slices within a tensor (initially zero for numeric, empty for string) of
-- 
GitLab


From 61994c21f5ddee273e0d79b08444b48858e11bfd Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Tue, 10 Apr 2018 20:00:22 +0800
Subject: [PATCH 019/450] Remove breaking ``` for math equations

---
 tensorflow/contrib/optimizer_v2/adam.py | 4 ----
 tensorflow/python/training/adam.py      | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 9bc160c0b9..a38c98f471 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -40,23 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
     Initialization:
 
-    ```
     \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
     \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
     \\(t <- 0\\) (Initialize timestep)
-    ```
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    ```
     $$t <- t + 1$$
     $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
     $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
     $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
     $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-    ```
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 1f2c40f18e..dc0f1aba09 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -43,23 +43,19 @@ class AdamOptimizer(optimizer.Optimizer):
 
     Initialization:
 
-    ```
     \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
     \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
     \\(t <- 0\\) (Initialize timestep)
-    ```
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    ```
     $$t <- t + 1$$
     $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
     $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
     $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
     $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-    ```
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
-- 
GitLab


From a6bc4afc97ce7a2a285e549822d06f4cbf51c4ef Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Fri, 13 Apr 2018 10:19:24 -0700
Subject: [PATCH 020/450] Cherry-picking PR #18444 into r1.8

---
 tensorflow/contrib/tensorrt/BUILD                           | 2 +-
 .../contrib/tensorrt/resources/trt_resource_manager.cc      | 6 ++++++
 .../contrib/tensorrt/resources/trt_resource_manager.h       | 6 +-----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 2f316767b3..fd3582e175 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -52,7 +52,6 @@ tf_custom_op_library(
         "ops/trt_engine_op.cc",
     ],
     deps = [
-        ":trt_engine_op_kernel",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
     ] + if_tensorrt([
@@ -183,6 +182,7 @@ tf_py_wrap_cc(
     copts = tf_copts(),
     deps = [
         ":trt_conversion",
+        ":trt_engine_op_kernel",
         "//tensorflow/core:framework_lite",
         "//util/python:python_headers",
     ],
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
index e663eed4dd..9c3698e5d1 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
@@ -19,6 +19,12 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 
+std::shared_ptr<TRTResourceManager>
+tensorflow::tensorrt::TRTResourceManager::instance() {
+  static std::shared_ptr<TRTResourceManager> instance_(new TRTResourceManager);
+  return instance_;
+}
+
 std::shared_ptr<tensorflow::ResourceMgr>
 tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) {
   // mutex is held for lookup only. Most instantiations where mutex will be held
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
index 5f8ad491d3..bc15b51e05 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
@@ -29,11 +29,7 @@ class TRTResourceManager {
   TRTResourceManager() = default;
 
  public:
-  static std::shared_ptr<TRTResourceManager> instance() {
-    static std::shared_ptr<TRTResourceManager> instance_(
-        new TRTResourceManager);
-    return instance_;
-  }
+  static std::shared_ptr<TRTResourceManager> instance();
   // returns a manager for given op, if it doesn't exists it creates one
   std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
 
-- 
GitLab


From 76a73f899cdc5e19ef2b99373524dcb4dba0bd2b Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Mon, 9 Apr 2018 17:45:13 -0700
Subject: [PATCH 021/450] boosted_trees: early stop hooks are fixed to stop at
 the right moment  by reading tensor values in a separate session after
 train_op run. PiperOrigin-RevId: 192217338

---
 .../python/estimator/boosted_trees_test.py    | 97 +++++++------------
 .../python/estimator/canned/boosted_trees.py  | 33 +++----
 .../estimator/canned/boosted_trees_test.py    | 63 +++++-------
 3 files changed, 71 insertions(+), 122 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
index e99a87f3b3..eee5910687 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.estimator.python.estimator import boosted_trees
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
 from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
@@ -69,10 +70,18 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
         for i in range(NUM_FEATURES)
     }
 
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
 
   def testTrainAndEvaluateEstimator(self):
     input_fn = _make_train_input_fn(is_classification=False)
@@ -88,9 +97,10 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 11)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 0.913176)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
 
   def testInferEstimator(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -108,31 +118,13 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
-
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
-
-
-class BoostedTreesClassifierTrainInMemoryTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
   def testBinaryClassifierTrainInMemoryAndEvalAndInfer(self):
     train_input_fn = _make_train_input_fn(is_classification=True)
@@ -145,36 +137,16 @@ class BoostedTreesClassifierTrainInMemoryTest(test_util.TensorFlowTestCase):
         n_trees=1,
         max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     # Check eval.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
-
-    # Check predict that all labels are correct.
+    # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0], predictions[0]['class_ids'])
-    self.assertAllClose([1], predictions[1]['class_ids'])
-    self.assertAllClose([1], predictions[2]['class_ids'])
-    self.assertAllClose([0], predictions[3]['class_ids'])
-    self.assertAllClose([0], predictions[4]['class_ids'])
-
-
-class BoostedTreesRegressorTrainInMemoryTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
 
   def testRegressorTrainInMemoryAndEvalAndInfer(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -187,20 +159,17 @@ class BoostedTreesRegressorTrainInMemoryTest(test_util.TensorFlowTestCase):
         n_trees=1,
         max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     # Check eval.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 2.2136638)
-
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
     # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 500ea03ea7..c5d5455b1a 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -209,8 +209,8 @@ class _CacheTrainingStatesUsingVariables(object):
         name='cache_insert')
 
 
-class StopAtAttemptsHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at the number of trees."""
+class _StopAtAttemptsHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at the number of attempts."""
 
   def __init__(self, num_finalized_trees_tensor, num_attempted_layers_tensor,
                max_trees, max_depth):
@@ -224,25 +224,17 @@ class StopAtAttemptsHook(session_run_hook.SessionRunHook):
         [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
 
   def after_run(self, run_context, run_values):
+    # The num_* values fetched alongside the training op may not reflect the
+    # growth done in this step. So, when they are approaching the limit,
+    # re-read the actual values with an additional session.run() call after
+    # the training step.
     num_finalized_trees, num_attempted_layers = run_values.results
+    if (num_finalized_trees >= self._max_trees - 1 or
+        num_attempted_layers > 2 * self._max_trees * self._max_depth - 1):
+      num_finalized_trees, num_attempted_layers = run_context.session.run(
+          [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
     if (num_finalized_trees >= self._max_trees or
-        1.0 * num_attempted_layers / self._max_depth > 2 * self._max_trees):
-      run_context.request_stop()
-
-
-class StopAtNumTreesHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at the number of trees."""
-
-  def __init__(self, num_trees_tensor, max_trees):
-    self._num_trees_tensor = num_trees_tensor
-    self._max_trees = max_trees
-
-  def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs(self._num_trees_tensor)
-
-  def after_run(self, run_context, run_values):
-    num_trees = run_values.results
-    if num_trees > self._max_trees:
+        num_attempted_layers > 2 * self._max_trees * self._max_depth):
       run_context.request_stop()
 
 
@@ -468,7 +460,8 @@ def _bt_model_fn(
     # Add an early stop hook.
     estimator_spec = estimator_spec._replace(
         training_hooks=estimator_spec.training_hooks +
-        (StopAtNumTreesHook(num_trees, tree_hparams.n_trees),))
+        (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers,
+                             tree_hparams.n_trees, tree_hparams.max_depth),))
   return estimator_spec
 
 
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 01e5cc7a5d..625745a3f9 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -69,7 +69,7 @@ def _make_train_input_fn(is_classification):
   return _input_fn
 
 
-class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
+class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
     self._feature_columns = {
@@ -79,10 +79,18 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
         for i in range(NUM_FEATURES)
     }
 
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
 
   def testTrainAndEvaluateBinaryClassifier(self):
     input_fn = _make_train_input_fn(is_classification=True)
@@ -97,7 +105,8 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
 
@@ -118,29 +127,9 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
     est.train(train_input_fn, steps=num_steps)
 
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
     # All labels are correct.
-    self.assertAllClose([0], predictions[0]['class_ids'])
-    self.assertAllClose([1], predictions[1]['class_ids'])
-    self.assertAllClose([1], predictions[2]['class_ids'])
-    self.assertAllClose([0], predictions[3]['class_ids'])
-    self.assertAllClose([0], predictions[4]['class_ids'])
-
-
-class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
 
   def testTrainAndEvaluateRegressor(self):
     input_fn = _make_train_input_fn(is_classification=False)
@@ -155,9 +144,10 @@ class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 11)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 0.913176)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
 
   def testInferRegressor(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -174,16 +164,13 @@ class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     predictions = list(est.predict(input_fn=predict_input_fn))
-
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
 
 class ModelFnTests(test_util.TensorFlowTestCase):
-- 
GitLab


From 3e1739c0c3c6cd3b74879f3e1872dd1354401e56 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 15:37:49 -0700
Subject: [PATCH 022/450] Revealing the range of node ids in the latest layer
 via resource' state

PiperOrigin-RevId: 192520351
---
 ...tedTreesCalculateBestGainsPerFeature.pbtxt |  4 +-
 ...pi_def_BoostedTreesGetEnsembleStates.pbtxt | 12 +++++-
 .../kernels/boosted_trees/boosted_trees.proto |  4 ++
 .../kernels/boosted_trees/resource_ops.cc     | 12 ++++++
 .../core/kernels/boosted_trees/resources.h    | 20 ++++++++++
 .../core/kernels/boosted_trees/stats_ops.cc   |  6 +--
 .../kernels/boosted_trees/training_ops.cc     |  8 ++++
 tensorflow/core/ops/boosted_trees_ops.cc      |  2 +
 .../core/ops/compat/ops_history.v1.pbtxt      |  4 ++
 .../python/estimator/canned/boosted_trees.py  |  9 ++---
 .../estimator/canned/boosted_trees_test.py    | 12 ++++++
 .../boosted_trees/resource_ops_test.py        | 31 +++++++++-----
 .../boosted_trees/stats_ops_test.py           |  8 ++--
 .../boosted_trees/training_ops_test.py        | 40 +++++++++++++++++--
 tensorflow/python/ops/boosted_trees_ops.py    | 15 ++++---
 15 files changed, 150 insertions(+), 37 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
index b1921e3507..62876a293c 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -4,7 +4,7 @@ op {
   in_arg {
     name: "node_id_range"
     description: <<END
-A Rank 1 tensor (shape=[2]) to specify the range [first, last] of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1]+1)` (Note that the last index node_id_range[1] is inclusive).
+A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as in `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
 END
   }
   in_arg {
@@ -84,4 +84,4 @@ In this manner, the output is the best split per features and per node, so that
 The length of output lists are all of the same length, `num_features`.
 The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 END
-}
+}
\ No newline at end of file
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
index ef45a92498..4377125224 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
@@ -31,5 +31,13 @@ END
 The number of layers we attempted to build (but not necessarily succeeded).
 END
   }
-  summary: "Retrieves the tree ensemble resource stamp token."
-}
+  out_arg {
+    name: "last_layer_nodes_range"
+    description: <<END
+Rank 1 tensor (shape=[2]) that contains the start and end ids of the nodes in
+the latest layer.
+END
+
+  }
+  summary: "Retrieves the tree ensemble resource stamp token, number of trees and growing statistics."
+}
\ No newline at end of file
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
index 106ceedc00..55599de731 100644
--- a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -100,6 +100,10 @@ message GrowingMetadata {
   // Number of layers that we have attempted to build. After pruning, these
   // layers might have been removed.
   int64 num_layers_attempted = 2;
+  // The start (inclusive) and end (exclusive) ids of the nodes in the latest
+  // layer of the latest tree.
+  int32 last_layer_node_start = 3;
+  int32 last_layer_node_end = 4;
 }
 
 // TreeEnsemble describes an ensemble of decision trees.
diff --git a/tensorflow/core/kernels/boosted_trees/resource_ops.cc b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
index f49242d856..563f7b8b08 100644
--- a/tensorflow/core/kernels/boosted_trees/resource_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
@@ -99,6 +99,7 @@ class BoostedTreesGetEnsembleStatesOp : public OpKernel {
     Tensor* output_num_trees_t = nullptr;
     Tensor* output_num_finalized_trees_t = nullptr;
     Tensor* output_num_attempted_layers_t = nullptr;
+    Tensor* output_last_layer_nodes_range_t = nullptr;
 
     OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
                                                      &output_stamp_token_t));
@@ -110,11 +111,22 @@ class BoostedTreesGetEnsembleStatesOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(3, TensorShape(),
                                             &output_num_attempted_layers_t));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                4, {2}, &output_last_layer_nodes_range_t));
 
     output_stamp_token_t->scalar<int64>()() = tree_ensemble_resource->stamp();
     output_num_trees_t->scalar<int32>()() = num_trees;
     output_num_finalized_trees_t->scalar<int32>()() = num_finalized_trees;
     output_num_attempted_layers_t->scalar<int32>()() = num_attempted_layers;
+
+    int32 range_start;
+    int32 range_end;
+    tree_ensemble_resource->GetLastLayerNodesRange(&range_start, &range_end);
+
+    output_last_layer_nodes_range_t->vec<int32>()(0) = range_start;
+    // For a completely empty ensemble, range_end will be 0. To keep the range
+    // valid (non-empty), clamp the end to at least 1.
+    output_last_layer_nodes_range_t->vec<int32>()(1) = std::max(1, range_end);
   }
 };
 
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index c82588b950..561ca3a18a 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -93,6 +93,26 @@ class BoostedTreesEnsembleResource : public StampedResource {
         new_num_layers);
   }
 
+  void UpdateLastLayerNodesRange(const int32 node_range_start,
+                                 int32 node_range_end) const {
+    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
+        node_range_start);
+    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
+        node_range_end);
+  }
+
+  void GetLastLayerNodesRange(int32* node_range_start,
+                              int32* node_range_end) const {
+    *node_range_start =
+        tree_ensemble_->growing_metadata().last_layer_node_start();
+    *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
+  }
+
+  int64 GetNumNodes(const int32 tree_id) {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->trees(tree_id).nodes_size();
+  }
+
   void UpdateGrowingMetadata() const;
 
   int32 GetNumLayersAttempted() {
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 33fdab6a86..16e65cf284 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -42,8 +42,8 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     const Tensor* node_id_range_t;
     OP_REQUIRES_OK(context, context->input("node_id_range", &node_id_range_t));
     const auto node_id_range = node_id_range_t->vec<int32>();
-    int32 node_id_first = node_id_range(0);
-    int32 node_id_last = node_id_range(1);  // inclusive.
+    const int32 node_id_first = node_id_range(0);  // inclusive
+    const int32 node_id_last = node_id_range(1);   // exclusive
     // stats_summary_list
     OpInputList stats_summary_list;
     OP_REQUIRES_OK(context, context->input_list("stats_summary_list",
@@ -86,7 +86,7 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
       std::vector<int32> output_thresholds;
       std::vector<float> output_left_node_contribs;
       std::vector<float> output_right_node_contribs;
-      for (int node_id = node_id_first; node_id <= node_id_last; ++node_id) {
+      for (int node_id = node_id_first; node_id < node_id_last; ++node_id) {
         // Calculate gains.
         cum_grad.clear();
         cum_hess.clear();
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index b9ded4054a..67cac14c52 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -101,6 +101,7 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
             << current_tree << " of ensemble of " << current_tree + 1
             << " trees.";
     bool split_happened = false;
+    int32 node_id_start = ensemble_resource->GetNumNodes(current_tree);
     // Add the splits to the tree.
     for (auto& split_entry : best_splits) {
       const int32 node_id = split_entry.first;
@@ -139,11 +140,15 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
           right_contrib, &left_node_id, &right_node_id);
       split_happened = true;
     }
+    int32 node_id_end = ensemble_resource->GetNumNodes(current_tree);
     if (split_happened) {
       // Update growable tree metadata.
       ensemble_resource->SetNumLayersGrown(current_tree, new_num_layers);
       // Finalize the tree if needed.
       if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth_) {
+        // If the tree is finalized, the next growing will start from node 0.
+        node_id_start = 0;
+        node_id_end = 1;
         ensemble_resource->SetIsFinalized(current_tree, true);
         if (pruning_mode_ == kPostPruning) {
           ensemble_resource->PostPruneTree(current_tree);
@@ -153,6 +158,9 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
           ensemble_resource->AddNewTree(kLayerByLayerTreeWeight);
         }
       }
+      // If we managed to split, update the node range. If we didn't, don't
+      // update as we will try to split the same nodes with new instances.
+      ensemble_resource->UpdateLastLayerNodesRange(node_id_start, node_id_end);
     }
   }
 
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index 297e94655f..8af4903418 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -128,6 +128,7 @@ REGISTER_OP("BoostedTreesGetEnsembleStates")
     .Output("num_trees: int32")
     .Output("num_finalized_trees: int32")
     .Output("num_attempted_layers: int32")
+    .Output("last_layer_nodes_range: int32")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused_input;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
@@ -135,6 +136,7 @@ REGISTER_OP("BoostedTreesGetEnsembleStates")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       c->set_output(3, c->Scalar());
+      c->set_output(4, c->Vector(2));
       return Status::OK();
     });
 
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 026bfa89cf..2f6f588d2c 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -10861,6 +10861,10 @@ op {
     name: "num_attempted_layers"
     type: DT_INT32
   }
+  output_arg {
+    name: "last_layer_nodes_range"
+    type: DT_INT32
+  }
   is_stateful: true
 }
 op {
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index c5d5455b1a..58af59dbb1 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -349,8 +349,8 @@ def _bt_model_fn(
             array_ops.zeros(
                 [batch_size, head.logits_dimension], dtype=dtypes.float32))
       with ops.control_dependencies([ensemble_reload]):
-        (stamp_token, num_trees, num_finalized_trees,
-         num_attempted_layers) = local_tree_ensemble.get_states()
+        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+         last_layer_nodes_range) = local_tree_ensemble.get_states()
         summary.scalar('ensemble/num_trees', num_trees)
         summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
         summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
@@ -393,10 +393,7 @@ def _bt_model_fn(
         (node_ids_per_feature, gains_list, thresholds_list,
          left_node_contribs_list, right_node_contribs_list) = (
              boosted_trees_ops.calculate_best_gains_per_feature(
-                 node_id_range=array_ops.stack([
-                     math_ops.reduce_min(node_ids),
-                     math_ops.reduce_max(node_ids)
-                 ]),
+                 node_id_range=last_layer_nodes_range,
                  stats_summary_list=stats_summary_list,
                  l1=tree_hparams.l1,
                  l2=tree_hparams.l2,
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 625745a3f9..7823ef8410 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -223,6 +223,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     second_round = """
@@ -307,6 +309,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
         """
     third_round = """
@@ -407,6 +411,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 3
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     return (first_round, second_round, third_round)
@@ -444,6 +450,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     second_round = """
@@ -528,6 +536,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
         """
     third_round = """
@@ -628,6 +638,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 3
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     return (first_round, second_round, third_round)
diff --git a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
index a223241e89..d5f0c22d6e 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
@@ -36,16 +36,18 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
       stamp_token = ensemble.get_stamp_token()
       self.assertEqual(0, stamp_token.eval())
-      (_, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (_, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(0, num_trees.eval())
       self.assertEqual(0, num_finalized_trees.eval())
       self.assertEqual(0, num_attempted_layers.eval())
+      self.assertAllEqual([0, 1], nodes_range.eval())
 
   def testCreateWithProto(self):
     with self.test_session():
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -141,6 +143,8 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 6
+          last_layer_node_start: 16
+          last_layer_node_end: 19
         }
       """, ensemble_proto)
       ensemble = boosted_trees_ops.TreeEnsemble(
@@ -148,28 +152,31 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
           stamp_token=7,
           serialized_proto=ensemble_proto.SerializeToString())
       resources.initialize_resources(resources.shared_resources()).run()
-      (stamp_token, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(7, stamp_token.eval())
       self.assertEqual(2, num_trees.eval())
       self.assertEqual(1, num_finalized_trees.eval())
       self.assertEqual(6, num_attempted_layers.eval())
+      self.assertAllEqual([16, 19], nodes_range.eval())
 
   def testSerializeDeserialize(self):
     with self.test_session():
       # Initialize.
       ensemble = boosted_trees_ops.TreeEnsemble('ensemble', stamp_token=5)
       resources.initialize_resources(resources.shared_resources()).run()
-      (stamp_token, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(5, stamp_token.eval())
       self.assertEqual(0, num_trees.eval())
       self.assertEqual(0, num_finalized_trees.eval())
       self.assertEqual(0, num_attempted_layers.eval())
+      self.assertAllEqual([0, 1], nodes_range.eval())
 
       # Deserialize.
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -201,6 +208,8 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 5
+          last_layer_node_start: 3
+          last_layer_node_end: 7
         }
       """, ensemble_proto)
       with ops.control_dependencies([
@@ -208,13 +217,15 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
               stamp_token=3,
               serialized_proto=ensemble_proto.SerializeToString())
       ]):
-        (stamp_token, num_trees, num_finalized_trees,
-         num_attempted_layers) = ensemble.get_states()
+        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+         nodes_range) = ensemble.get_states()
       self.assertEqual(3, stamp_token.eval())
       self.assertEqual(1, num_trees.eval())
       # This reads from metadata, not really counting the layers.
       self.assertEqual(5, num_attempted_layers.eval())
       self.assertEqual(0, num_finalized_trees.eval())
+      self.assertAllEqual([3, 7], nodes_range.eval())
+
 
       # Serialize.
       new_ensemble_proto = boosted_trees_pb2.TreeEnsemble()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
index a54cc43517..4d09cf94d4 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -29,7 +29,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation without any regularization."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -76,7 +76,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L2."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -123,7 +123,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L1."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -173,7 +173,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L2."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
diff --git a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
index 4226ff75c2..d6c0047747 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
@@ -132,6 +132,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -314,6 +316,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -461,6 +465,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -615,6 +621,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 3
+          last_layer_node_end: 5
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -624,7 +632,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
     """Test that the metadata is updated even though we can't split."""
     with self.test_session() as session:
       tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -655,6 +664,9 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+
         }
       """, tree_ensemble_config)
 
@@ -685,7 +697,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
       # Expect no new splits created, but attempted (global) stats updated. Meta
       # data for this tree should not be updated (we didn't succeed building a
-      # layer.
+      # layer. Node ranges don't change.
       new_stamp, serialized = session.run(tree_ensemble.serialize())
       tree_ensemble = boosted_trees_pb2.TreeEnsemble()
       tree_ensemble.ParseFromString(serialized)
@@ -721,6 +733,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -730,7 +744,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
     """Test metadata is updated correctly when no split due to prepruning."""
     with self.test_session() as session:
       tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -761,6 +776,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """, tree_ensemble_config)
 
@@ -851,6 +868,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -941,6 +960,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -1046,6 +1067,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 3
+          last_layer_node_end: 7
         }
        """
       self.assertEqual(new_stamp, 2)
@@ -1179,6 +1202,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 3
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
        """
       self.assertEqual(new_stamp, 3)
@@ -1268,6 +1293,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -1307,7 +1334,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       # Expect the ensemble to be empty as post-pruning will prune
       # the entire finalized tree.
       self.assertEqual(new_stamp, 2)
-      self.assertProtoEquals("""
+      self.assertProtoEquals(
+          """
       trees {
         nodes {
           leaf {
@@ -1359,6 +1387,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       growing_metadata {
         num_trees_attempted: 1
         num_layers_attempted: 2
+        last_layer_node_start: 0
+        last_layer_node_end: 1
       }
       """, res_ensemble)
 
@@ -1455,6 +1485,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
index 174d00987f..2a2bcdd9d6 100644
--- a/tensorflow/python/ops/boosted_trees_ops.py
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -115,7 +115,7 @@ class TreeEnsemble(object):
 
   def get_stamp_token(self):
     """Returns the current stamp token of the resource."""
-    stamp_token, _, _, _ = (
+    stamp_token, _, _, _, _ = (
         gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
             self.resource_handle))
     return stamp_token
@@ -124,17 +124,20 @@ class TreeEnsemble(object):
     """Returns states of the tree ensemble.
 
     Returns:
-      stamp_token, num_trees, num_finalized_trees, num_attempted_layers.
+      stamp_token, num_trees, num_finalized_trees, num_attempted_layers and
+      range of the nodes in the latest layer.
     """
-    stamp_token, num_trees, num_finalized_trees, num_attempted_layers = (
-        gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
-            self.resource_handle))
+    (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+     nodes_range) = (
+         gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
+             self.resource_handle))
     # Use identity to give names.
     return (array_ops.identity(stamp_token, name='stamp_token'),
             array_ops.identity(num_trees, name='num_trees'),
             array_ops.identity(num_finalized_trees, name='num_finalized_trees'),
             array_ops.identity(
-                num_attempted_layers, name='num_attempted_layers'))
+                num_attempted_layers, name='num_attempted_layers'),
+            array_ops.identity(nodes_range, name='last_layer_nodes_range'))
 
   def serialize(self):
     """Serializes the ensemble into proto and returns the serialized proto.
-- 
GitLab


From 33c737b70d42e05cabc43b4c6e778e988b6d0a9e Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Wed, 11 Apr 2018 16:59:45 -0700
Subject: [PATCH 023/450] boosted_trees: make sure ensemble deserialization
 happens for the non-TRAIN modes too.

PiperOrigin-RevId: 192532297
---
 .../python/estimator/canned/boosted_trees.py  | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 58af59dbb1..0ecc8c7089 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -317,27 +317,28 @@ def _bt_model_fn(
                                                    head.logits_dimension)
 
     # Create Ensemble resources.
-    if is_single_machine:
-      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-      local_tree_ensemble = tree_ensemble
-      ensemble_reload = control_flow_ops.no_op()
-    else:
-      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-      with ops.device(worker_device):
-        local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
-            name=name + '_local', is_local=True)
-      # TODO(soroush): Do partial updates if this becomes a bottleneck.
-      ensemble_reload = local_tree_ensemble.deserialize(
-          *tree_ensemble.serialize())
-
+    tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
     # Create logits.
     if mode != model_fn.ModeKeys.TRAIN:
       logits = boosted_trees_ops.predict(
-          tree_ensemble_handle=local_tree_ensemble.resource_handle,
+          # For non-TRAIN mode, ensemble doesn't change after initialization,
+          # so no local copy is needed; using tree_ensemble directly.
+          tree_ensemble_handle=tree_ensemble.resource_handle,
           bucketized_features=input_feature_list,
           logits_dimension=head.logits_dimension,
           max_depth=tree_hparams.max_depth)
     else:
+      if is_single_machine:
+        local_tree_ensemble = tree_ensemble
+        ensemble_reload = control_flow_ops.no_op()
+      else:
+        # Have a local copy of ensemble for the distributed setting.
+        with ops.device(worker_device):
+          local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
+              name=name + '_local', is_local=True)
+        # TODO(soroush): Do partial updates if this becomes a bottleneck.
+        ensemble_reload = local_tree_ensemble.deserialize(
+            *tree_ensemble.serialize())
       if cache:
         cached_tree_ids, cached_node_ids, cached_logits = cache.lookup()
       else:
-- 
GitLab


From 7810e47e7d7c90b0e3df8e251964a38ebff9d978 Mon Sep 17 00:00:00 2001
From: Martin Wicke <577277+martinwicke@users.noreply.github.com>
Date: Mon, 16 Apr 2018 11:56:46 -0700
Subject: [PATCH 024/450] Merge pull request #18568 from
 case540/enable_git_tag_override

Add ability to override git tag in __git_version__ string.
---
 tensorflow/tensorflow.bzl              |  2 +-
 tensorflow/tools/git/gen_git_source.py | 37 +++++++++++++++++++++-----
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 528f811b40..b286834ded 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1704,7 +1704,7 @@ def tf_version_info_genrule():
       ],
       outs=["util/version_info.cc"],
       cmd=
-      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
+      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
       local=1,
       tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
 
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index cbcdbf5b80..db2580755b 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -139,7 +139,7 @@ def configure(src_base_path, gen_path, debug=False):
     print("gen_git_source.py: spec is %r" % spec)
 
 
-def get_git_version(git_base_path):
+def get_git_version(git_base_path, git_tag_override):
   """Get the git version from the repository.
 
   This function runs `git describe ...` in the path given as `git_base_path`.
@@ -152,6 +152,9 @@ def get_git_version(git_base_path):
 
   Args:
     git_base_path: where the .git directory is located
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   Returns:
     A bytestring representing the git version
   """
@@ -161,6 +164,14 @@ def get_git_version(git_base_path):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
+    if git_tag_override:
+      split_val = val.split("-")
+      if len(split_val) != 3:
+        raise Exception(
+            ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
+             "but got '%s'") % val)
+      split_val[0] = git_tag_override
+      val = bytes("-".join(split_val))
     return val if val else unknown_label
   except subprocess.CalledProcessError:
     return unknown_label
@@ -197,7 +208,7 @@ const int tf_monolithic_build() {
   open(filename, "w").write(contents)
 
 
-def generate(arglist):
+def generate(arglist, git_tag_override=None):
   """Generate version_info.cc as given `destination_file`.
 
   Args:
@@ -217,6 +228,10 @@ def generate(arglist):
   `ref_symlink` is unused in this script but passed, because the build
     system uses that file to detect when commits happen.
 
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
+
   Raises:
     RuntimeError: If ./configure needs to be run, RuntimeError will be raised.
   """
@@ -234,11 +249,11 @@ def generate(arglist):
       raise RuntimeError(
           "Run ./configure again, branch was '%s' but is now '%s'" %
           (old_branch, new_branch))
-    git_version = get_git_version(data["path"])
+    git_version = get_git_version(data["path"], git_tag_override)
   write_version_info(dest_file, git_version)
 
 
-def raw_generate(output_file):
+def raw_generate(output_file, git_tag_override=None):
   """Simple generator used for cmake/make build systems.
 
   This does not create any symlinks. It requires the build system
@@ -246,9 +261,12 @@ def raw_generate(output_file):
 
   Args:
     output_file: Output filename for the version info cc
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   """
 
-  git_version = get_git_version(".")
+  git_version = get_git_version(".", git_tag_override)
   write_version_info(output_file, git_version)
 
 
@@ -270,6 +288,11 @@ parser.add_argument(
     "--gen_root_path", type=str,
     help="Root path to place generated git files (created by --configure).")
 
+parser.add_argument(
+    "--git_tag_override", type=str,
+    help="Override git tag value in the __git_version__ string. Useful when "
+         "creating release builds before the release tag is created.")
+
 parser.add_argument(
     "--generate",
     type=str,
@@ -288,9 +311,9 @@ if args.configure is not None:
     raise RuntimeError("Must pass --gen_root_path arg when running --configure")
   configure(args.configure, args.gen_root_path, debug=args.debug)
 elif args.generate is not None:
-  generate(args.generate)
+  generate(args.generate, args.git_tag_override)
 elif args.raw_generate is not None:
-  raw_generate(args.raw_generate)
+  raw_generate(args.raw_generate, args.git_tag_override)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
                      "must be used")
-- 
GitLab


From e9e5356b206e9399b5d06b618fc77f460e9613bf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 28 Mar 2018 10:03:37 -0700
Subject: [PATCH 025/450] Enable the Grappler arithmetic optimizer by default
 in Python tests.

PiperOrigin-RevId: 190787954
---
 tensorflow/python/framework/test_util.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 990fa429a1..bf00fa6439 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -974,8 +974,6 @@ class TensorFlowTestCase(googletest.TestCase):
       config.graph_options.optimizer_options.opt_level = -1
       config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
-      config.graph_options.rewrite_options.arithmetic_optimization = (
-          rewriter_config_pb2.RewriterConfig.OFF)
       return config
 
     if graph is None:
-- 
GitLab


From 9e4818375f3853c1a8cdd18fe22d1b1f447cfaef Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 10:30:32 -0700
Subject: [PATCH 026/450] Disable x * x -> square(x) Grappler rewrite for
 complex types unless the op is on CPU. Square is not registered for complex
 types on GPU, and doing so produces a crash with CUDA_ILLEGAL_INSTRUCTION
 when running it on open source Ubuntu.

PiperOrigin-RevId: 192788160
---
 .../optimizers/arithmetic_optimizer.cc        | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index fa0f7c1c6e..a8fa4a10cb 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -1732,13 +1732,22 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
 
   if (node->op() == "Mul" && node->input(0) == node->input(1) &&
       !OptimizedNodeExists(*node, "square")) {
-    NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
-    new_square_node->set_op("Square");
-    for (int i = 1; i < new_square_node->input_size(); ++i) {
-      new_square_node->set_input(i - 1, new_square_node->input(i));
+    const DataType type = GetDataTypeFromAttr(*node, "T");
+    bool is_complex = (type == DT_COMPLEX64) || (type == DT_COMPLEX128);
+    string dontcare;
+    string device;
+    bool is_on_cpu =
+        DeviceNameUtils::SplitDeviceName(node->device(), &dontcare, &device) &&
+        str_util::StrContains(device, DEVICE_CPU);
+    if (!is_complex || is_on_cpu) {
+      NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
+      new_square_node->set_op("Square");
+      for (int i = 1; i < new_square_node->input_size(); ++i) {
+        new_square_node->set_input(i - 1, new_square_node->input(i));
+      }
+      new_square_node->mutable_input()->RemoveLast();
+      return new_square_node->name();
     }
-    new_square_node->mutable_input()->RemoveLast();
-    return new_square_node->name();
   }
 
   if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
-- 
GitLab


From b358c9932e0d2f50e50baa5f1a9441e3594244c4 Mon Sep 17 00:00:00 2001
From: Sourabh Bajaj <sourabhbajaj@google.com>
Date: Wed, 11 Apr 2018 15:20:11 -0700
Subject: [PATCH 027/450] GCS Filesystem should not cache checkpoint file as we
 need to read the updated checkpoints from the contents.

PiperOrigin-RevId: 192517819
(cherry picked from commit 079d63d59b75bdfd25f7371efda25ec5f6739b78)
---
 .../core/platform/cloud/gcs_file_system.cc    |  8 ++++
 .../platform/cloud/gcs_file_system_test.cc    | 48 +++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 3c0dc13d75..6ed1d5dad2 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -301,6 +301,14 @@ class GcsRandomAccessFile : public RandomAccessFile {
     TF_RETURN_IF_ERROR(file_block_cache_->Read(filename_, offset, n, scratch,
                                                &bytes_transferred));
     *result = StringPiece(scratch, bytes_transferred);
+    // Never keep the checkpoint file cached: its contents are updated and used
+    // for iterating checkpoints. The size check keeps std::equal in bounds.
+    const string checkpoint_ending = "/checkpoint";
+    if (filename_.size() >= checkpoint_ending.size() &&
+        std::equal(checkpoint_ending.rbegin(), checkpoint_ending.rend(),
+                   filename_.rbegin())) {
+      file_block_cache_->RemoveFile(filename_);
+    }
     if (bytes_transferred < n) {
       // This is not an error per se. The RandomAccessFile interface expects
       // that Read returns OutOfRange if fewer bytes were read than requested.
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 2fbde9b6a7..e9eca04fef 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -198,6 +198,54 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
   EXPECT_EQ("0123", result);
 }
 
+TEST(GcsFileSystemTest, NewRandomAccessFile_CheckpointFile_WithBlockCache) {
+  // Our underlying file in this test changes as new data comes in
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "012345678"),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "abcdefghi")});
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
+      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
+
+  char scratch[100];
+  StringPiece result;
+  {
+    // We are instantiating this in an enclosed scope to make sure after the
+    // unique ptr goes out of scope, we can still access result.
+    std::unique_ptr<RandomAccessFile> file;
+    TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/checkpoint", &file));
+
+    // Read the first chunk. The cache will be populated with the first block of
+    // 9 bytes.
+    scratch[5] = 'x';
+    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
+    EXPECT_EQ("0123", result);
+    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
+
+    // The second read must not be served from the cache: the checkpoint file
+    // is never cached, so a new request is made.
+    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
+    EXPECT_EQ("abcd", result);
+    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
+  }
+}
+
 TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_Flush) {
   // Our underlying file in this test is a 15 byte file with contents
   // "0123456789abcde".
-- 
GitLab


From 2b47b7f374612c34985aad4adedfa8d9a5b2440c Mon Sep 17 00:00:00 2001
From: Junpeng Lao <junpeng.lao@unifr.ch>
Date: Tue, 17 Apr 2018 13:28:02 +0200
Subject: [PATCH 028/450] Unify style in bijector

---
 .../contrib/distributions/python/ops/bijectors/invert.py      | 4 ++--
 .../python/ops/bijectors/masked_autoregressive.py             | 4 ++--
 .../contrib/distributions/python/ops/bijectors/permute.py     | 4 ++--
 .../contrib/distributions/python/ops/bijectors/real_nvp.py    | 4 ++--
 .../contrib/distributions/python/ops/bijectors/reshape.py     | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
index 1904239a0e..84a3289ba2 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
@@ -18,14 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.ops.distributions import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector
 
 __all__ = [
     "Invert",
 ]
 
 
-class Invert(bijector_lib.Bijector):
+class Invert(bijector.Bijector):
   """Bijector which inverts another Bijector.
 
   Example Use: [ExpGammaDistribution (see Background & Context)](
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
index ef56cf6ddd..83667b0e80 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
@@ -32,7 +32,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import template as template_ops
 from tensorflow.python.ops import variable_scope as variable_scope_lib
-from tensorflow.python.ops.distributions import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
@@ -42,7 +42,7 @@ __all__ = [
 ]
 
 
-class MaskedAutoregressiveFlow(bijector_lib.Bijector):
+class MaskedAutoregressiveFlow(bijector.Bijector):
   """Affine MaskedAutoregressiveFlow bijector for vector-valued events.
 
   The affine autoregressive flow [(Papamakarios et al., 2016)][3] provides a
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
index 4978167803..12a16a3f2b 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
@@ -28,7 +28,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
@@ -36,7 +36,7 @@ __all__ = [
 ]
 
 
-class Permute(bijector_lib.Bijector):
+class Permute(bijector.Bijector):
   """Permutes the rightmost dimension of a `Tensor`.
 
   ```python
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
index f09ab21bce..66e8a5b9b3 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
@@ -25,7 +25,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import template as template_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
@@ -34,7 +34,7 @@ __all__ = [
 ]
 
 
-class RealNVP(bijector_lib.Bijector):
+class RealNVP(bijector.Bijector):
   """RealNVP "affine coupling layer" for vector-valued events.
 
   Real NVP models a normalizing flow on a `D`-dimensional distribution via a
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
index f21b982ba6..5497c422e4 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
@@ -28,7 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
@@ -44,7 +44,7 @@ def _ndims_from_shape(shape):
   return array_ops.shape(shape)[0]
 
 
-class Reshape(bijector_lib.Bijector):
+class Reshape(bijector.Bijector):
   """Reshapes the `event_shape` of a `Tensor`.
 
   The semantics generally follow that of `tf.reshape()`, with
-- 
GitLab


From 9620211c64e95818d59ad6991059a4a66b6a064d Mon Sep 17 00:00:00 2001
From: Junpeng Lao <junpeng.lao@unifr.ch>
Date: Tue, 17 Apr 2018 17:19:16 +0200
Subject: [PATCH 029/450] minor format clean up

---
 .../contrib/distributions/python/ops/bijectors/weibull.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
index 39129cd22c..a22560fe80 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
@@ -128,7 +128,7 @@ class Weibull(bijector.Bijector):
       return x
     is_valid = check_ops.assert_non_negative(
         x,
-        message="Forward transformation input must be at least {}.".format(0))
+        message="Forward transformation input must be at least 0.")
     return control_flow_ops.with_dependencies([is_valid], x)
 
   def _maybe_assert_valid_y(self, y):
-- 
GitLab


From fe1753af198dbfc64f7ab623865dd91cbdda8eeb Mon Sep 17 00:00:00 2001
From: Junpeng Lao <junpeng.lao@unifr.ch>
Date: Tue, 17 Apr 2018 18:51:01 +0200
Subject: [PATCH 030/450] WIP implemented Ordered bijector

---
 .../kernel_tests/bijectors/ordered_test.py    | 111 +++++++++++++++++
 .../python/ops/bijectors/__init__.py          |   2 +
 .../python/ops/bijectors/ordered.py           | 114 ++++++++++++++++++
 3 files changed, 227 insertions(+)
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
 create mode 100644 tensorflow/contrib/distributions/python/ops/bijectors/ordered.py

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
new file mode 100644
index 0000000000..1bcbfed6c3
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
@@ -0,0 +1,111 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops.bijectors.ordered import Ordered
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.platform import test
+
+
+rng = np.random.RandomState(42)
+
+
+class OrderedBijectorTest(test.TestCase):
+  """Tests correctness of the ordered transformation."""
+
+  def testBijectorVector(self):
+    with self.test_session():
+      ordered = Ordered()
+      self.assertEqual("ordered", ordered.name)
+      x = np.asarray([[2., 3, 4], [4., 8, 13]])  # Strictly increasing rows.
+      y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]  # [x0, log(diff(x))].
+      self.assertAllClose(y, ordered.forward(x).eval())
+      self.assertAllClose(x, ordered.inverse(y).eval())
+      self.assertAllClose(
+          np.sum(np.asarray(y)[..., 1:], axis=-1),
+          ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          atol=0.,
+          rtol=1e-7)
+      self.assertAllClose(
+          -ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          ordered.forward_log_det_jacobian(x, event_ndims=1).eval(),
+          atol=0.,
+          rtol=1e-7)
+
+  def testBijectorUnknownShape(self):
+    with self.test_session():
+      ordered = Ordered()
+      self.assertEqual("ordered", ordered.name)
+      x = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
+      real_x = np.asarray([[2., 3, 4], [4., 8, 13]])  # Strictly increasing.
+      y = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
+      real_y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]  # [x0, log(diff)].
+      self.assertAllClose(real_y, ordered.forward(x).eval(
+          feed_dict={x: real_x}))
+      self.assertAllClose(real_x, ordered.inverse(y).eval(
+          feed_dict={y: real_y}))
+      self.assertAllClose(
+          np.sum(np.asarray(real_y)[..., 1:], axis=-1),
+          ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(
+              feed_dict={y: real_y}),
+          atol=0.,
+          rtol=1e-7)
+      self.assertAllClose(
+          -ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(
+              feed_dict={y: real_y}),
+          ordered.forward_log_det_jacobian(x, event_ndims=1).eval(
+              feed_dict={x: real_x}),
+          atol=0.,
+          rtol=1e-7)
+
+  def testShapeGetters(self):
+    with self.test_session():
+      x = tensor_shape.TensorShape([4])
+      y = tensor_shape.TensorShape([4])  # Ordered keeps the event size.
+      bijector = Ordered(validate_args=True)
+      self.assertAllEqual(y, bijector.forward_event_shape(x))
+      self.assertAllEqual(y.as_list(),
+                          bijector.forward_event_shape_tensor(
+                              x.as_list()).eval())
+      self.assertAllEqual(x, bijector.inverse_event_shape(y))
+      self.assertAllEqual(x.as_list(),
+                          bijector.inverse_event_shape_tensor(
+                              y.as_list()).eval())
+
+  def testBijectiveAndFinite(self):
+    with self.test_session():
+      ordered = Ordered()
+      # Forward inputs must be strictly increasing along the last axis, so
+      # sort random draws to obtain valid points of the domain.
+      x = np.sort(rng.randn(3, 10), axis=-1).astype(np.float32)
+      # Inverse inputs are unconstrained, so any finite real vector is valid.
+      # Ordered keeps the event size unchanged:
+      #   y[0] = x[0], y[1:] = log(x[1:] - x[:-1])
+      # hence y must have the same shape as x.
+      y = rng.randn(3, 10).astype(np.float32)
+      assert_bijective_and_finite(ordered, x, y, event_ndims=1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index babce80396..51478dbeff 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -30,6 +30,7 @@
 @@Invert
 @@Kumaraswamy
 @@MaskedAutoregressiveFlow
+@@Ordered
 @@Permute
 @@PowerTransform
 @@RealNVP
@@ -67,6 +68,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.inline import *
 from tensorflow.contrib.distributions.python.ops.bijectors.invert import *
 from tensorflow.contrib.distributions.python.ops.bijectors.kumaraswamy import *
 from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import *
+from tensorflow.contrib.distributions.python.ops.bijectors.ordered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.permute import *
 from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import *
 from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import *
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
new file mode 100644
index 0000000000..ec8f660144
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
@@ -0,0 +1,114 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ordered bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
+
+
+__all__ = [
+    "Ordered",
+]
+
+
+class Ordered(bijector.Bijector):
+  """Bijector which maps a tensor x_k that has increasing elements in the last
+  dimension to an unconstrained tensor y_k.
+
+  On the last dimension of the tensor, Ordered bijector performs:
+  `y[0] = x[0]`
+  `y[1:] = math_ops.log(x[1:] - x[:-1])`
+
+  Example Use:
+
+  ```python
+  bijector.Ordered().forward([2., 3., 4.])
+  # Result: [2., 0., 0.]
+
+  bijector.Ordered().inverse([2., 0., 0.])
+  # Result: [2., 3., 4.]
+  ```
+  """
+
+  def __init__(self,
+               validate_args=False,
+               name="ordered"):
+    self._graph_parents = []
+    self._name = name
+    super(Ordered, self).__init__(
+        forward_min_event_ndims=1,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward_event_shape(self, input_shape):
+    if input_shape.ndims is None or input_shape[-1] is None:
+      return input_shape
+    return tensor_shape.TensorShape([input_shape[-1]])
+
+  def _forward_event_shape_tensor(self, input_shape):
+    return (input_shape[-1])[..., array_ops.newaxis]
+
+  def _inverse_event_shape(self, output_shape):
+    if output_shape.ndims is None or output_shape[-1] is None:
+      return output_shape
+    if output_shape[-1] <= 1:
+      raise ValueError("output_shape[-1] = %d <= 1" % output_shape[-1])
+    return tensor_shape.TensorShape([output_shape[-1]])
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    if self.validate_args:
+      # It is not possible for a negative shape so we need only check <= 1.
+      is_greater_one = check_ops.assert_greater(
+          output_shape[-1], 1, message="Need last dimension greater than 1.")
+      output_shape = control_flow_ops.with_dependencies(
+          [is_greater_one], output_shape)
+    return (output_shape[-1])[..., array_ops.newaxis]
+
+  def _forward(self, x):
+    x = self._maybe_assert_valid_x(x)
+    y0 = array_ops.expand_dims(x[..., 0], -1)
+    yk = math_ops.log(x[..., 1:] - x[..., :-1])
+    y = array_ops.concat([y0, yk], axis=-1)
+    return y
+
+  def _inverse(self, y):
+    x0 = array_ops.expand_dims(y[..., 0], -1)
+    xk = math_ops.exp(y[..., 1:])
+    x = array_ops.concat([x0, xk], axis=-1)
+    return math_ops.cumsum(x, axis=-1)
+
+  def _inverse_log_det_jacobian(self, y):
+    return math_ops.reduce_sum(y[..., 1:], axis=-1)
+
+  def _forward_log_det_jacobian(self, x):  # log|det J| = -sum(log(diff(x)))
+    return -math_ops.reduce_sum(math_ops.log(x[..., 1:] - x[..., :-1]), axis=-1)
+
+  def _maybe_assert_valid_x(self, x):
+    if not self.validate_args:
+      return x
+    is_valid = check_ops.assert_positive(
+        x[..., 1:] - x[..., :-1],  # Consecutive differences must be > 0.
+        message="Forward transformation input must be strictly increasing.")
+    return control_flow_ops.with_dependencies([is_valid], x)
\ No newline at end of file
-- 
GitLab


From 105d9795ae692ed2486652e5d672825ccbd726e9 Mon Sep 17 00:00:00 2001
From: Noah Eisen <ncteisen@google.com>
Date: Tue, 17 Apr 2018 12:21:10 -0700
Subject: [PATCH 031/450] Removes another custom implementation of
 ZeroCopyInputStream, instead uses the now public gRPC implementation.

Also, moves GrpcByteSource to grpc_util, to keep it near the other serialization code.

Lastly, gives a more verbose error if serialization (unparsing) fails (which should not ever happen).

PiperOrigin-RevId: 193236893
---
 tensorflow/core/distributed_runtime/rpc/BUILD |   1 +
 .../core/distributed_runtime/rpc/grpc_state.h |   8 +-
 .../core/distributed_runtime/rpc/grpc_util.cc | 107 +++---------------
 .../core/distributed_runtime/rpc/grpc_util.h  |  58 ++++++----
 .../distributed_runtime/rpc/grpc_util_test.cc |  12 +-
 .../rpc/grpc_worker_service_impl.h            |  28 +----
 6 files changed, 65 insertions(+), 149 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index fa0f8c9b52..e973a22f45 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -189,6 +189,7 @@ cc_library(
     srcs = ["grpc_worker_service_impl.cc"],
     hdrs = ["grpc_worker_service_impl.h"],
     deps = [
+        ":grpc_util",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "@grpc//:grpc++_unsecure",
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index 0b6f9474dd..59dbb7ae04 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -56,7 +56,11 @@ class RPCState : public GrpcClientCQTag {
     }
 
     response_ = response;
-    GrpcMaybeUnparseProto(request, &request_buf_);
+    ::grpc::Status s = GrpcMaybeUnparseProto(request, &request_buf_);
+    if (!s.ok()) {
+      LOG(ERROR) << "GrpcMaybeUnparseProto returned with non-ok status: "
+                 << s.error_message();
+    }
     call_ =
         std::move(stub->PrepareUnaryCall(&context_, method, request_buf_, cq));
     call_->StartCall();
@@ -73,7 +77,7 @@ class RPCState : public GrpcClientCQTag {
       // to Finish for client-side unary calls, ok should never be false
       s.Update(errors::Internal("unexpected ok value at rpc completion"));
     }
-    if (s.ok() && !GrpcMaybeParseProto(response_buf_, response_)) {
+    if (s.ok() && !GrpcMaybeParseProto(&response_buf_, response_)) {
       s.Update(errors::Internal("could not parse rpc response"));
     }
     if (!s.ok()) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
index c80728544b..ece56a2727 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
@@ -18,115 +18,42 @@ limitations under the License.
 
 namespace tensorflow {
 
-GrpcByteBufferSource::GrpcByteBufferSource() {}
-
-bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
-  cur_ = -1;
-  left_ = 0;
-  ptr_ = nullptr;
-  byte_count_ = 0;
-  bool ok = src.Dump(&slices_).ok();
-  if (!ok) {
-    slices_.clear();
-  }
-  return ok;
-}
-
-bool GrpcByteBufferSource::Next(const void** data, int* size) {
-  // Use loop instead of if in case buffer contained empty slices.
-  while (left_ == 0) {
-    // Advance to next slice.
-    cur_++;
-    if (cur_ >= slices_.size()) {
-      return false;
-    }
-    const ::grpc::Slice& s = slices_[cur_];
-    left_ = s.size();
-    ptr_ = reinterpret_cast<const char*>(s.begin());
-  }
-
-  *data = ptr_;
-  *size = left_;
-  byte_count_ += left_;
-  ptr_ += left_;
-  left_ = 0;
-  return true;
-}
-
-void GrpcByteBufferSource::BackUp(int count) {
-  ptr_ -= count;
-  left_ += count;
-  byte_count_ -= count;
-}
-
-bool GrpcByteBufferSource::Skip(int count) {
-  const void* data;
-  int size;
-  while (Next(&data, &size)) {
-    if (size >= count) {
-      BackUp(size - count);
-      return true;
-    }
-    // size < count;
-    count -= size;
-  }
-  // error or we have too large count;
-  return false;
-}
-
-grpc::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
-  return byte_count_;
-}
-
-void GrpcMaybeUnparseProto(const protobuf::Message& src,
-                           grpc::ByteBuffer* dst) {
-  // TODO(sanjay): For bigger protos, serialize into a ZeroCopyOutputStream.
-  ::grpc::Slice s(src.ByteSizeLong());
-  src.SerializeWithCachedSizesToArray(
-      const_cast<uint8*>(reinterpret_cast<const uint8*>(s.begin())));
-  ::grpc::ByteBuffer buffer(&s, 1);
-  dst->Swap(&buffer);
+::grpc::Status GrpcMaybeUnparseProto(const protobuf::Message& src,
+                                     grpc::ByteBuffer* dst) {
+  bool own_buffer;
+  return ::grpc::GenericSerialize<::grpc::ProtoBufferWriter, protobuf::Message>(
+      src, dst, &own_buffer);
 }
 
 // GrpcMaybeUnparseProto from a string simply copies the string to the
 // ByteBuffer.
-void GrpcMaybeUnparseProto(const string& src, grpc::ByteBuffer* dst) {
+::grpc::Status GrpcMaybeUnparseProto(const string& src, grpc::ByteBuffer* dst) {
   ::grpc::Slice s(src.data(), src.size());
   ::grpc::ByteBuffer buffer(&s, 1);
   dst->Swap(&buffer);
+  return ::grpc::Status::OK;
 }
 
-bool GrpcMaybeParseProto(const grpc::ByteBuffer& src, protobuf::Message* dst) {
-  GrpcByteBufferSource stream;
-  if (!stream.Init(src)) return false;
-  return dst->ParseFromZeroCopyStream(&stream);
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, protobuf::Message* dst) {
+  grpc::ProtoBufferReader reader(src);
+  return dst->ParseFromZeroCopyStream(&reader);
 }
 
 // Overload of GrpcParseProto so we can decode a TensorResponse without
 // extra copying.  This overload is used by the RPCState class in
 // grpc_state.h.
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, TensorResponse* dst) {
-  struct ByteSource : public TensorResponse::Source {
-    const ::grpc::ByteBuffer* buffer;
-    GrpcByteBufferSource src;
-    bool ok;
-
-    ::tensorflow::protobuf::io::ZeroCopyInputStream* contents() override {
-      ok = src.Init(*buffer);
-      return &src;
-    }
-  };
-  ByteSource bs;
-  bs.buffer = &src;
-  return dst->ParseFrom(&bs).ok() && bs.ok;
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, TensorResponse* dst) {
+  ::tensorflow::GrpcByteSource byte_source(src);
+  auto s = dst->ParseFrom(&byte_source);
+  return s.ok();
 }
 
 // GrpcMaybeParseProto into a string simply copies bytes into the string.
-bool GrpcMaybeParseProto(const grpc::ByteBuffer& src, string* dst) {
+bool GrpcMaybeParseProto(grpc::ByteBuffer* src, string* dst) {
   dst->clear();
-  dst->reserve(src.Length());
+  dst->reserve(src->Length());
   std::vector<::grpc::Slice> slices;
-  if (!src.Dump(&slices).ok()) {
+  if (!src->Dump(&slices).ok()) {
     return false;
   }
   for (const ::grpc::Slice& s : slices) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index d5e7e9f5b3..4b58781b54 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -29,6 +29,33 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Thin wrapper around ::grpc::ProtoBufferReader to give TensorResponse an
+// efficient byte reader from which to decode a RecvTensorResponse.
+class GrpcByteSource : public TensorResponse::Source {
+ public:
+  explicit GrpcByteSource(::grpc::ByteBuffer* buffer) : buffer_(buffer) {}
+  ~GrpcByteSource() override { DeleteStream(); }
+
+  typedef ::grpc::ProtoBufferReader Reader;
+
+  protobuf::io::ZeroCopyInputStream* contents() override {
+    DeleteStream();
+    stream_ = new (&space_) Reader(buffer_);
+    return stream_;
+  }
+
+ private:
+  void DeleteStream() {
+    if (stream_) {
+      stream_->~Reader();
+    }
+  }
+
+  ::grpc::ByteBuffer* buffer_;  // Not owned
+  Reader* stream_ = nullptr;    // Points into space_ if non-nullptr
+  alignas(Reader) char space_[sizeof(Reader)];  // Aligned storage for *stream_
+};
+
 constexpr char kStreamRemovedMessage[] = "Stream removed";
 
 // Identify if the given grpc::Status corresponds to an HTTP stream removed
@@ -79,38 +106,21 @@ typedef std::shared_ptr<::grpc::Channel> SharedGrpcChannelPtr;
 inline string GrpcIdKey() { return "tf-rpc"; }
 
 // Serialize src and store in *dst.
-void GrpcMaybeUnparseProto(const protobuf::Message& src,
-                           ::grpc::ByteBuffer* dst);
+::grpc::Status GrpcMaybeUnparseProto(const protobuf::Message& src,
+                                     ::grpc::ByteBuffer* dst);
 
 // Parse contents of src and initialize *dst with them.
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, protobuf::Message* dst);
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, protobuf::Message* dst);
 
 // Specialization for TensorResponse
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, TensorResponse* dst);
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, TensorResponse* dst);
 
 // Copy string src to grpc buffer *dst.
-void GrpcMaybeUnparseProto(const string& src, ::grpc::ByteBuffer* dst);
+::grpc::Status GrpcMaybeUnparseProto(const string& src,
+                                     ::grpc::ByteBuffer* dst);
 
 // Copy grpc buffer src to string *dst.
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, string* dst);
-
-// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
-class GrpcByteBufferSource : public ::grpc::protobuf::io::ZeroCopyInputStream {
- public:
-  GrpcByteBufferSource();
-  bool Init(const ::grpc::ByteBuffer& src);  // Can be called multiple times.
-  bool Next(const void** data, int* size) override;
-  void BackUp(int count) override;
-  bool Skip(int count) override;
-  ::grpc::protobuf::int64 ByteCount() const override;
-
- private:
-  std::vector<::grpc::Slice> slices_;
-  int cur_;          // Current slice index.
-  int left_;         // Number of bytes in slices_[cur_] left to yield.
-  const char* ptr_;  // Address of next byte in slices_[cur_] to yield.
-  ::grpc::protobuf::int64 byte_count_;
-};
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, string* dst);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
index 5356fb36e4..6eaa0b1833 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
@@ -67,7 +67,7 @@ TEST(GrpcProto, Unparse) {
   proto.add_container("hello");
   proto.add_container("world");
   grpc::ByteBuffer buf;
-  GrpcMaybeUnparseProto(proto, &buf);
+  ASSERT_TRUE(GrpcMaybeUnparseProto(proto, &buf).ok());
   CleanupAllRequest parsed;
   ASSERT_TRUE(parsed.ParseFromString(ToString(buf)));
   ASSERT_EQ(proto.DebugString(), parsed.DebugString());
@@ -80,7 +80,7 @@ TEST(GrpcProto, UnparseToString) {
   string str;
   CHECK(proto.SerializeToString(&str));
   grpc::ByteBuffer buf;
-  GrpcMaybeUnparseProto(str, &buf);
+  ASSERT_TRUE(GrpcMaybeUnparseProto(str, &buf).ok());
   CleanupAllRequest parsed;
   ASSERT_TRUE(parsed.ParseFromString(ToString(buf)));
   ASSERT_EQ(proto.DebugString(), parsed.DebugString());
@@ -103,7 +103,7 @@ TEST(GrpcProto, Parse) {
     CleanupAllRequest proto = MakeProto(c.length);
     ::grpc::ByteBuffer src = MakeBuffer(proto.SerializeAsString(), c.slices);
     CleanupAllRequest parsed;
-    ASSERT_TRUE(GrpcMaybeParseProto(src, &parsed))
+    ASSERT_TRUE(GrpcMaybeParseProto(&src, &parsed))
         << c.length << " " << c.slices;
     ASSERT_EQ(proto.DebugString(), parsed.DebugString());
   }
@@ -127,7 +127,7 @@ TEST(GrpcProto, ParseFromString) {
     ::grpc::ByteBuffer src = MakeBuffer(proto.SerializeAsString(), c.slices);
     string parsed_str;
     CleanupAllRequest parsed;
-    ASSERT_TRUE(GrpcMaybeParseProto(src, &parsed_str))
+    ASSERT_TRUE(GrpcMaybeParseProto(&src, &parsed_str))
         << c.length << " " << c.slices;
     ASSERT_TRUE(parsed.ParseFromString(parsed_str));
     ASSERT_EQ(proto.DebugString(), parsed.DebugString());
@@ -140,7 +140,7 @@ static void BM_UnparseGrpc(int iters, int size) {
   testing::StartTiming();
   for (int i = 0; i < iters; i++) {
     grpc::ByteBuffer buf;
-    GrpcMaybeUnparseProto(proto, &buf);
+    CHECK(GrpcMaybeUnparseProto(proto, &buf).ok());
   }
   testing::StopTiming();
 }
@@ -167,7 +167,7 @@ static void BM_ParseGrpc(int iters, int size, int num_slices) {
   testing::StartTiming();
 
   for (int i = 0; i < iters; i++) {
-    CHECK(GrpcMaybeParseProto(buf, &proto));
+    CHECK(GrpcMaybeParseProto(&buf, &proto));
   }
 
   testing::StopTiming();
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index 0abac4f3c7..a54ea93796 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -26,36 +26,10 @@ limitations under the License.
 #include "grpc++/impl/codegen/sync_stream.h"
 #include "grpc++/support/byte_buffer.h"
 
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
-namespace tensorflow {
-class GrpcByteSource : public TensorResponse::Source {
- public:
-  explicit GrpcByteSource(::grpc::ByteBuffer* buffer) : buffer_(buffer) {}
-  ~GrpcByteSource() override { DeleteStream(); }
-
-  typedef ::grpc::ProtoBufferReader Reader;
-
-  protobuf::io::ZeroCopyInputStream* contents() override {
-    DeleteStream();
-    stream_ = new (&space_) Reader(buffer_);
-    return stream_;
-  }
-
- private:
-  void DeleteStream() {
-    if (stream_) {
-      stream_->~Reader();
-    }
-  }
-
-  ::grpc::ByteBuffer* buffer_;  // Not owned
-  Reader* stream_ = nullptr;  // Points into space_ if non-nullptr
-  char space_[sizeof(Reader)];
-};
-}  // namespace tensorflow
-
 namespace grpc {
 class CompletionQueue;
 class Channel;
-- 
GitLab


From 91be39b2bae2d935fb9eb8c9a7cd1d09642784af Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 12:31:36 -0700
Subject: [PATCH 032/450] Relaxes the type constraints for the features in
 ServingInputReceiver, so it will accept anything convertible to a Tensor or
 SparseTensor.

This makes it possible to use with tf.contrib.labeled_tensor.

PiperOrigin-RevId: 193238295
---
 tensorflow/python/estimator/export/export.py  | 14 ++++++++++++-
 .../python/estimator/export/export_test.py    | 20 +++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 9206a4964b..41c1f5a2e2 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -74,8 +74,20 @@ class ServingInputReceiver(collections.namedtuple(
         raise ValueError('feature keys must be strings: {}.'.format(name))
       if not (isinstance(tensor, ops.Tensor)
               or isinstance(tensor, sparse_tensor.SparseTensor)):
-        raise ValueError(
+        value_error = ValueError(
             'feature {} must be a Tensor or SparseTensor.'.format(name))
+        # NOTE(ericmc): This if-else block is a specific carve-out for
+        # LabeledTensor, which has a `.tensor` attribute and which is
+        # convertible to tf.Tensor via ops.convert_to_tensor.
+        # Allowing all types convertible to tf.Tensor is considered by soergel@
+        # to be too permissive.
+        if hasattr(tensor, 'tensor'):
+          try:
+            ops.convert_to_tensor(tensor)
+          except TypeError:
+            raise value_error
+        else:
+          raise value_error
 
     if receiver_tensors is None:
       raise ValueError('receiver_tensors must be defined.')
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
index eb9688bc97..c203be7dac 100644
--- a/tensorflow/python/estimator/export/export_test.py
+++ b/tensorflow/python/estimator/export/export_test.py
@@ -39,6 +39,21 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 
 
+class LabeledTensorMock(object):
+  """Mock class emulating LabeledTensor."""
+
+  def __init__(self):
+    self.tensor = constant_op.constant([1])
+
+
+def _convert_labeled_tensor_mock_to_tensor(value, *args, **kwargs):
+  return ops.internal_convert_to_tensor(value.tensor, *args, **kwargs)
+
+
+ops.register_tensor_conversion_function(LabeledTensorMock,
+                                        _convert_labeled_tensor_mock_to_tensor)
+
+
 class ExportTest(test_util.TensorFlowTestCase):
 
   def test_serving_input_receiver_constructor(self):
@@ -135,6 +150,11 @@ class ExportTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       _ = export.ServingInputReceiver(feature, receiver_tensor)
 
+  def test_feature_labeled_tensor(self):
+    feature = LabeledTensorMock()
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    _ = export.ServingInputReceiver(feature, receiver_tensor)
+
   def test_receiver_wrong_type(self):
     feature = constant_op.constant(5)
     receiver_tensor = "not a tensor"
-- 
GitLab


From 4d2de472999653bb7000be47959b1c5b996d6496 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 12:31:42 -0700
Subject: [PATCH 033/450] Fix the test gensym to prevent creating duplicate
 names in the same test.

PiperOrigin-RevId: 193238314
---
 .../contrib/autograph/converters/converter_test_base.py  | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/converter_test_base.py b/tensorflow/contrib/autograph/converters/converter_test_base.py
index 23b61cf781..41c2e71702 100644
--- a/tensorflow/contrib/autograph/converters/converter_test_base.py
+++ b/tensorflow/contrib/autograph/converters/converter_test_base.py
@@ -35,14 +35,17 @@ from tensorflow.python.platform import test
 
 
 class FakeNamer(object):
+  """A fake namer that uses a global counter to generate unique names."""
+
+  def __init__(self):
+    self.i = 0
 
   def new_symbol(self, name_root, used):
-    i = 0
     while True:
-      name = '%s%d' % (name_root, i)
+      self.i += 1
+      name = '%s%d' % (name_root, self.i)
       if name not in used:
         return name
-      i += 1
 
   def compiled_function_name(self,
                              original_fqn,
-- 
GitLab


From 2fe299f39785611e29a5fb0d859cd283b3f9587c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 12:36:20 -0700
Subject: [PATCH 034/450] [XLA] Fix arguments to IsSparseArray and IsDenseArray

PiperOrigin-RevId: 193238920
---
 .../compiler/xla/service/pattern_matcher.h    |  4 +--
 .../xla/service/pattern_matcher_test.cc       | 32 ++++++++++++++++++-
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index 5d49638077..f5a4f2c9df 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -532,7 +532,7 @@ class ShapePattern {
       ShapeType,
       ShapePatternLayoutImpl<Impl, const ::xla::Layout,
                              LayoutPatternFormatImpl<LayoutPatternBaseImpl>>>
-  IsDenseArray(const ::xla::Layout* layout) const {
+  IsDenseArray() const {
     return WithLayout(Layout().WithDenseFormat());
   }
 
@@ -540,7 +540,7 @@ class ShapePattern {
       ShapeType,
       ShapePatternLayoutImpl<Impl, const ::xla::Layout,
                              LayoutPatternFormatImpl<LayoutPatternBaseImpl>>>
-  IsSparseArray(const ::xla::Layout* layout) const {
+  IsSparseArray() const {
     return WithLayout(Layout().WithSparseFormat());
   }
 
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
index 5291b1437a..c88157c312 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -67,6 +67,7 @@ TEST(PatternMatcherTest, ScalarShape) {
   EXPECT_TRUE(Match(&scalar_shape, match::Shape(&matched_shape).IsScalar()));
   EXPECT_EQ(matched_shape, &scalar_shape);
   EXPECT_TRUE(Match(&scalar_shape, match::Shape().IsArray()));
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().IsDenseArray()));
   EXPECT_FALSE(Match(&scalar_shape, match::Shape().IsTuple()));
   EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithElementType(F32)));
   EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithRank(0)));
@@ -75,11 +76,13 @@ TEST(PatternMatcherTest, ScalarShape) {
       match::Shape().WithSubshape({0}, match::Shape()).WithElementType(F32)));
 }
 
-TEST(PatternMatcherTest, ArrayShape) {
+TEST(PatternMatcherTest, DenseArrayShape) {
   auto array_shape = ShapeUtil::MakeShape(F32, {2, 3, 4});
   Shape* matched_shape;
   EXPECT_TRUE(Match(&array_shape, match::Shape(&matched_shape).IsArray()));
   EXPECT_EQ(matched_shape, &array_shape);
+  EXPECT_TRUE(Match(&array_shape, match::Shape().IsDenseArray()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsSparseArray()));
   EXPECT_FALSE(Match(&array_shape, match::Shape().IsScalar()));
   EXPECT_FALSE(Match(&array_shape, match::Shape().IsTuple()));
   EXPECT_TRUE(Match(&array_shape, match::Shape().WithElementType(F32)));
@@ -90,6 +93,33 @@ TEST(PatternMatcherTest, ArrayShape) {
   EXPECT_FALSE(Match(&array_shape,
                      match::Shape().WithLayout(
                          match::Layout(&matched_layout).WithSparseFormat())));
+  EXPECT_TRUE(Match(&array_shape,
+                    match::Shape().WithLayout(
+                        match::Layout(&matched_layout).WithDenseFormat())));
+  EXPECT_EQ(matched_layout, &array_shape.layout());
+}
+
+TEST(PatternMatcherTest, SparseArrayShape) {
+  auto array_shape = ShapeUtil::MakeShapeWithSparseLayout(F32, {2, 3, 4}, 10);
+  Shape* matched_shape;
+  EXPECT_TRUE(Match(&array_shape, match::Shape(&matched_shape).IsArray()));
+  EXPECT_EQ(matched_shape, &array_shape);
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsDenseArray()));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().IsSparseArray()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsScalar()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsTuple()));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithElementType(F32)));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithRank(3)));
+  EXPECT_FALSE(
+      Match(&array_shape, match::Shape().WithSubshape({0}, match::Shape())));
+  Layout* matched_layout;
+  EXPECT_FALSE(Match(&array_shape,
+                     match::Shape().WithLayout(
+                         match::Layout(&matched_layout).WithDenseFormat())));
+  EXPECT_TRUE(Match(&array_shape,
+                    match::Shape().WithLayout(
+                        match::Layout(&matched_layout).WithSparseFormat())));
+  EXPECT_EQ(matched_layout, &array_shape.layout());
 }
 
 TEST(PatternMatcherTest, TupleShape) {
-- 
GitLab


From 33d55d7caff2bd32fa2b1c5cacb7ac251c48e27d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 12:52:34 -0700
Subject: [PATCH 035/450] Cache the ag_internal module, to avoid falsely
 rejecting it when in the namespace of a previously converted function.
 Explicitly reject lambda functions, for now, because they require special
 treatment.

PiperOrigin-RevId: 193241279
---
 .../contrib/autograph/impl/conversion.py      | 35 +++++++++++++------
 .../contrib/autograph/impl/conversion_test.py | 22 ++++++++++++
 2 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 11bbe7888a..5653e991f6 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -154,7 +154,16 @@ def entity_to_graph(o, conversion_map, arg_values, arg_types):
   if tf_inspect.isclass(o):
     node, name, ns = class_to_graph(o, conversion_map)
   elif tf_inspect.isfunction(o):
-    node, name, ns = function_to_graph(o, conversion_map, arg_values, arg_types)
+    # TODO(mdan): This is not a reliable mechanism.
+    # The most reliable way is to check the source code, the AST will contain
+    # a Lambda node instead of a FunctionDef
+    if o.__name__ == '<lambda>':
+      raise NotImplementedError(
+          'lambda functions are not yet supported; declare the function'
+          ' using def instead: %s' % o)
+    else:
+      node, name, ns = function_to_graph(o, conversion_map, arg_values,
+                                         arg_types)
   elif tf_inspect.ismethod(o):
     node, name, ns = function_to_graph(o, conversion_map, arg_values, arg_types)
   else:
@@ -222,16 +231,22 @@ def _add_reserved_symbol(namespace, name, entity):
     raise ValueError('The name "%s" is reserved and may not be used.' % name)
 
 
+ag_internal = None
+
+
 def _add_self_references(namespace, api_module):
-  # Craft a module that exposes parts of the external API as well as certain
-  # internal modules.
-  ag_internal = imp.new_module('autograph')
-  ag_internal.converted_call = api_module.converted_call
-  ag_internal.utils = utils
-  # TODO(mdan): Add safeguards against name clashes.
-  # We don't want to create a submodule because we want the operators to be
-  # accessible as ag__.<operator>
-  ag_internal.__dict__.update(operators.__dict__)
+  """Adds namespace references to the module that exposes the api itself."""
+  global ag_internal
+  if ag_internal is None:
+    # Craft a module that exposes parts of the external API as well as certain
+    # internal modules.
+    ag_internal = imp.new_module('autograph')
+    ag_internal.converted_call = api_module.converted_call
+    ag_internal.utils = utils
+    # TODO(mdan): Add safeguards against name clashes.
+    # We don't want to create a submodule because we want the operators to be
+    # accessible as ag__.<operator>
+    ag_internal.__dict__.update(operators.__dict__)
 
   _add_reserved_symbol(namespace, 'ag__', ag_internal)
 
diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
index f0b597c12f..da3220892f 100644
--- a/tensorflow/contrib/autograph/impl/conversion_test.py
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -78,6 +78,28 @@ class ConversionTest(test.TestCase):
         conversion_map.dependency_cache[f].body[0].body[0].value.func.id)
     self.assertEqual('tf__g', conversion_map.dependency_cache[g].name)
 
+  def test_entity_to_graph_lambda(self):
+    f = lambda a: a
+
+    with self.assertRaises(NotImplementedError):
+      conversion_map = self._simple_conversion_map()
+      conversion.entity_to_graph(f, conversion_map, None, None)
+
+  def test_ag_module_cached(self):
+    def callee():
+      return range(3)
+
+    def caller(a):
+      return a()
+
+    conversion_map = self._simple_conversion_map()
+    _, _, callee_ns = conversion.entity_to_graph(
+        callee, conversion_map, None, None)
+    _, _, caller_ns = conversion.entity_to_graph(
+        caller, conversion_map, None, None)
+
+    self.assertTrue(callee_ns['ag__'] is caller_ns['ag__'])
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 83418120b7c2659fedddd7c85b65d3c3e6aa94e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 13:20:42 -0700
Subject: [PATCH 036/450] Fixing a bug in strided slice. The op was not
 handling negative indices correctly.

PiperOrigin-RevId: 193245539
---
 .../internal/optimized/optimized_ops.h        | 144 +++++++++++++----
 .../internal/reference/reference_ops.h        | 150 +++++++++++++-----
 .../contrib/lite/kernels/strided_slice.cc     |  22 +--
 .../lite/kernels/strided_slice_test.cc        |  37 ++---
 .../contrib/lite/testing/generate_examples.py |  14 +-
 .../propagate_fixed_sizes.cc                  | 144 +++++++++++++----
 .../resolve_constant_strided_slice.cc         |  93 ++++++++---
 tensorflow/contrib/lite/toco/model.h          |  55 +++++++
 tensorflow/contrib/lite/toco/toco_port.h      |   8 +
 9 files changed, 493 insertions(+), 174 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 7fc6615965..d269056800 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -5561,43 +5561,127 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
       output_dims, 0);
 }
 
+// UNOPTIMIZED COPY of StridedSlice from reference_ops.h (see comments there).
+
+// Use until std::clamp() is available from C++17.
+inline int Clamp(const int v, const int lo, const int hi) {
+  TFLITE_DCHECK(!(hi < lo));
+  if (hi < v) return hi;
+  if (v < lo) return lo;
+  return v;
+}
+
+inline int StartForAxis(int begin_mask, const std::vector<int>& start_indices,
+                        const std::vector<int>& strides,
+                        const Dims<4>& input_shape, int axis) {
+  // Begin with the specified index
+  int start = start_indices[axis];
+
+  // begin_mask override
+  if (begin_mask & 1 << axis) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the first element. These values will get
+      // clamped below (Note: We could have set them to 0 and axis_size-1, but
+      // use lowest() and max() to maintain symmetry with StopForAxis())
+      start = std::numeric_limits<int>::lowest();
+    } else {
+      // Backward iteration - use the last element.
+      start = std::numeric_limits<int>::max();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.sizes[axis];
+  if (start < 0) {
+    start += axis_size;
+  }
+
+  // Clamping
+  start = Clamp(start, 0, axis_size - 1);
+
+  return start;
+}
+
+inline int StopForAxis(int end_mask, const std::vector<int>& stop_indices,
+                       const std::vector<int>& strides,
+                       const Dims<4>& input_shape, int axis) {
+  // Begin with the specified index
+  int stop = stop_indices[axis];
+
+  // end_mask override
+  if (end_mask & (1 << axis)) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the last element. These values will get
+      // clamped below
+      stop = std::numeric_limits<int>::max();
+    } else {
+      // Backward iteration - use the first element.
+      stop = std::numeric_limits<int>::lowest();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.sizes[axis];
+  if (stop < 0) {
+    stop += axis_size;
+  }
+
+  // Clamping
+  // Because the end index points one past the last element, we need slightly
+  // different clamping ranges depending on the direction.
+  if (strides[axis] > 0) {
+    // Forward iteration
+    stop = Clamp(stop, 0, axis_size);
+  } else {
+    // Backward iteration
+    stop = Clamp(stop, -1, axis_size - 1);
+  }
+
+  return stop;
+}
+
+inline bool LoopCondition(int index, int stop, int stride) {
+  // True when we have reached the end of an axis and should loop.
+  return stride > 0 ? index >= stop : index <= stop;
+}
+
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                          int begin_mask, int end_mask,
-                         const std::vector<int>& starts,
-                         const std::vector<int>& stops,
+                         const std::vector<int>& start_indices,
+                         const std::vector<int>& stop_indices,
                          const std::vector<int>& strides, T* output_data,
                          const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("StridedSlice");
-  const int start_b = (begin_mask & 8) ? 0 : starts[3];
-  const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
-  const int start_h = (begin_mask & 4) ? 0 : starts[2];
-  const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
-  const int start_w = (begin_mask & 2) ? 0 : starts[1];
-  const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
-  const int start_d = (begin_mask & 1) ? 0 : starts[0];
-  const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];
+  TFLITE_DCHECK_EQ(start_indices.size(), 4);
+  TFLITE_DCHECK_EQ(stop_indices.size(), 4);
+  TFLITE_DCHECK_EQ(strides.size(), 4);
+  const int start_b =
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 3);
+  const int stop_b =
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 3);
+  const int start_h =
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 2);
+  const int stop_h =
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 2);
+  const int start_w =
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 1);
+  const int stop_w =
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 1);
+  const int start_d =
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 0);
+  const int stop_d =
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 0);
 
   T* out_ptr = output_data;
-  if (strides[0] == 0) {
-    for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
-      for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
-        for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
-          const int len = stop_d - start_d;
-          memcpy(out_ptr,
-                 input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
-                 len * sizeof(T));
-          out_ptr += len;
-        }
-      }
-    }
-  } else {
-    for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
-      for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
-        for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
-          for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
-            *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
-          }
+  for (int in_b = start_b; !LoopCondition(in_b, stop_b, strides[3]);
+       in_b += strides[3]) {
+    for (int in_h = start_h; !LoopCondition(in_h, stop_h, strides[2]);
+         in_h += strides[2]) {
+      for (int in_w = start_w; !LoopCondition(in_w, stop_w, strides[1]);
+           in_w += strides[1]) {
+        for (int in_d = start_d; !LoopCondition(in_d, stop_d, strides[0]);
+             in_d += strides[0]) {
+          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
         }
       }
     }
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 791fb52391..49a93b0c6d 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3026,59 +3026,139 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
       output_dims, 0);
 }
 
-inline bool LoopCondition(int index, int stop, int stride) {
-  return stride > 0 ? index < stop : index > stop;
+// STRIDED SLICE
+// The functions below for StridedSlice are mirrored in a number of places:
+//
+//   propagate_fixed_sizes.cc
+//   propagate_shapes.cc
+//   resolve_constant_strided_slice.cc
+//   optimized_ops.h
+//
+// It is designed for an arbitrary number of dimensions, even though dimensions
+// here are fixed at 4. This is because we expect to eventually support
+// arbitrary dimensionality. Also note that the axis orders are reversed for
+// runtime ops, and so the indices and masks must be as well too.
+//
+// Be warned this code involves some rather subtle logic of python slicing. The
+// best "ground truth" is to compare results to actual python execution.
+
+// Use until std::clamp() is available from C++17.
+inline int Clamp(const int v, const int lo, const int hi) {
+  TFLITE_DCHECK(!(hi < lo));
+  if (hi < v) return hi;
+  if (v < lo) return lo;
+  return v;
+}
+
+inline int StartForAxis(int begin_mask, const std::vector<int>& start_indices,
+                        const std::vector<int>& strides,
+                        const Dims<4>& input_shape, int axis) {
+  // Begin with the specified index
+  int start = start_indices[axis];
+
+  // begin_mask override
+  if (begin_mask & 1 << axis) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the first element. These values will get
+      // clamped below (Note: We could have set them to 0 and axis_size-1, but
+      // use lowest() and max() to maintain symmetry with StopForAxis())
+      start = std::numeric_limits<int>::lowest();
+    } else {
+      // Backward iteration - use the last element.
+      start = std::numeric_limits<int>::max();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.sizes[axis];
+  if (start < 0) {
+    start += axis_size;
+  }
+
+  // Clamping
+  start = Clamp(start, 0, axis_size - 1);
+
+  return start;
 }
 
-inline int StartIndex(int start, int stride, int dim, bool masked) {
-  return masked ? (stride > 0 ? 0 : dim - 1) : start;
+inline int StopForAxis(int end_mask, const std::vector<int>& stop_indices,
+                       const std::vector<int>& strides,
+                       const Dims<4>& input_shape, int axis) {
+  // Begin with the specified index
+  int stop = stop_indices[axis];
+
+  // end_mask override
+  if (end_mask & (1 << axis)) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the last element. These values will get
+      // clamped below
+      stop = std::numeric_limits<int>::max();
+    } else {
+      // Backward iteration - use the first element.
+      stop = std::numeric_limits<int>::lowest();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.sizes[axis];
+  if (stop < 0) {
+    stop += axis_size;
+  }
+
+  // Clamping
+  // Because the end index points one past the last element, we need slightly
+  // different clamping ranges depending on the direction.
+  if (strides[axis] > 0) {
+    // Forward iteration
+    stop = Clamp(stop, 0, axis_size);
+  } else {
+    // Backward iteration
+    stop = Clamp(stop, -1, axis_size - 1);
+  }
+
+  return stop;
 }
 
-inline int StopIndex(int start, int stop, int stride, int dim, bool masked,
-                     bool shrink_axis_masked) {
-  return shrink_axis_masked ? stride > 0 ? start + 1 : start - 1
-                            : masked ? (stride > 0 ? dim : -1) : stop;
+inline bool LoopCondition(int index, int stop, int stride) {
+  // True when we have reached the end of an axis and should loop.
+  return stride > 0 ? index >= stop : index <= stop;
 }
 
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
-                         int begin_mask, int end_mask, int shrink_axis_mask,
-                         const std::vector<int>& starts,
-                         const std::vector<int>& stops,
+                         int begin_mask, int end_mask,
+                         const std::vector<int>& start_indices,
+                         const std::vector<int>& stop_indices,
                          const std::vector<int>& strides, T* output_data,
                          const Dims<4>& output_dims) {
-  TFLITE_DCHECK_EQ(starts.size(), 4);
-  TFLITE_DCHECK_EQ(stops.size(), 4);
+  TFLITE_DCHECK_EQ(start_indices.size(), 4);
+  TFLITE_DCHECK_EQ(stop_indices.size(), 4);
   TFLITE_DCHECK_EQ(strides.size(), 4);
   const int start_b =
-      StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8);
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 3);
   const int stop_b =
-      StopIndex(start_b, stops[3], strides[3], input_dims.sizes[3],
-                end_mask & 8, shrink_axis_mask & 8);
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 3);
   const int start_h =
-      StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4);
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 2);
   const int stop_h =
-      StopIndex(start_h, stops[2], strides[2], input_dims.sizes[2],
-                end_mask & 4, shrink_axis_mask & 4);
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 2);
   const int start_w =
-      StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2);
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 1);
   const int stop_w =
-      StopIndex(start_w, stops[1], strides[1], input_dims.sizes[1],
-                end_mask & 2, shrink_axis_mask & 2);
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 1);
   const int start_d =
-      StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1);
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 0);
   const int stop_d =
-      StopIndex(start_d, stops[0], strides[0], input_dims.sizes[0],
-                end_mask & 1, shrink_axis_mask & 1);
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 0);
 
   T* out_ptr = output_data;
-  for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]);
+  for (int in_b = start_b; !LoopCondition(in_b, stop_b, strides[3]);
        in_b += strides[3]) {
-    for (int in_h = start_h; LoopCondition(in_h, stop_h, strides[2]);
+    for (int in_h = start_h; !LoopCondition(in_h, stop_h, strides[2]);
          in_h += strides[2]) {
-      for (int in_w = start_w; LoopCondition(in_w, stop_w, strides[1]);
+      for (int in_w = start_w; !LoopCondition(in_w, stop_w, strides[1]);
            in_w += strides[1]) {
-        for (int in_d = start_d; LoopCondition(in_d, stop_d, strides[0]);
+        for (int in_d = start_d; !LoopCondition(in_d, stop_d, strides[0]);
              in_d += strides[0]) {
           *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
         }
@@ -3087,18 +3167,6 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
-template <typename T>
-inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
-                         int begin_mask, int end_mask,
-                         const std::vector<int>& starts,
-                         const std::vector<int>& stops,
-                         const std::vector<int>& strides, T* output_data,
-                         const Dims<4>& output_dims) {
-  StridedSlice(input_data, input_dims, begin_mask, end_mask,
-               /*shrink_axis_mask=*/0, starts, stops, strides, output_data,
-               output_dims);
-}
-
 template <typename T>
 inline void Slice(const T* input_data, const Dims<4>& input_dims,
                   const std::vector<int>& begin, const std::vector<int>& size,
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
index e6d5c300dc..40ac436b7d 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -87,6 +87,8 @@ inline int32_t ClampedIndex(int32_t index, int dim, bool pos_stride) {
                           std::min(std::max(index, -dim), dim - 1), dim));
 }
 
+// TODO(b/77971377) this logic should be removed, as it's a duplication of
+// StartForAxis() & StopForAxis() in kernels/internal/reference/reference_ops.h
 inline int32_t GetBeginValueAtIndex(StridedSliceContext* op_context, int idx) {
   const int dim = op_context->input->dims->data[idx];
   const bool pos_stride = GetTensorData<int32_t>(op_context->strides)[idx] > 0;
@@ -188,8 +190,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   std::vector<int32_t> strides;
 
   for (int idx = op_context.dims - 1; idx >= 0; --idx) {
-    starts.emplace_back(GetBeginValueAtIndex(&op_context, idx));
-    stops.emplace_back(GetEndValueAtIndex(&op_context, idx));
+    starts.emplace_back(GetTensorData<int32_t>(op_context.begin)[idx]);
+    stops.emplace_back(GetTensorData<int32_t>(op_context.end)[idx]);
     strides.emplace_back(GetTensorData<int32_t>(op_context.strides)[idx]);
   }
 
@@ -202,15 +204,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   int begin_mask =
       ReverseMaskBits(op_context.params->begin_mask, op_context.dims);
   int end_mask = ReverseMaskBits(op_context.params->end_mask, op_context.dims);
-  int shrink_axis_mask =
-      ReverseMaskBits(op_context.params->shrink_axis_mask, op_context.dims);
-
-#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                          \
-  kernel_type::StridedSlice(                                                   \
-      GetTensorData<data_type>(op_context.input),                              \
-      GetTensorDims(op_context.input), begin_mask, end_mask, shrink_axis_mask, \
-      starts, stops, strides, GetTensorData<data_type>(op_context.output),     \
-      GetTensorDims(op_context.output))
+
+#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                    \
+  kernel_type::StridedSlice(GetTensorData<data_type>(op_context.input),  \
+                            GetTensorDims(op_context.input), begin_mask, \
+                            end_mask, starts, stops, strides,            \
+                            GetTensorData<data_type>(op_context.output), \
+                            GetTensorDims(op_context.output))
 
   switch (op_context.input->type) {
     case kTfLiteFloat32:
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
index 22d7b097cb..cc39179bc7 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -377,29 +377,18 @@ TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
   StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
-  m.SetEnd({3});
+  m.SetEnd({2});
   m.SetStrides({1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
 }
 
-TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
-  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
-  m.SetInput({1, 2, 3, 4});
-  m.SetBegin({2});
-  m.SetEnd({1});
-  m.SetStrides({1});
-  m.Invoke();
-  EXPECT_TRUE(m.GetOutputShape().empty());
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
-}
-
 TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
   StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
-  m.SetEnd({3});
+  m.SetEnd({1});
   m.SetStrides({1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
@@ -421,7 +410,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
   StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
-  m.SetEnd({2, 3});
+  m.SetEnd({1, 3});
   m.SetStrides({1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
@@ -432,7 +421,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
   StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
-  m.SetEnd({2, 3});
+  m.SetEnd({2, 1});
   m.SetStrides({1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
@@ -443,7 +432,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
   StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
-  m.SetEnd({2, 3});
+  m.SetEnd({1, 1});
   m.SetStrides({1, 1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
@@ -454,7 +443,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 3, 2});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
@@ -465,7 +454,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({2, 1, 2});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2}));
@@ -476,7 +465,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 1, 2});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
@@ -487,7 +476,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({2, 3, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
@@ -498,7 +487,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 3, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
@@ -509,7 +498,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({2, 1, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
@@ -520,7 +509,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 1, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
@@ -553,7 +542,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1Uint8) {
                                                  0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 3, 2});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index e045c27427..f72a4e0d8c 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1758,19 +1758,7 @@ def make_strided_slice_tests(zip_path):
           "shrink_axis_mask": [None, 1, 8, 11, 15, -1],
           "constant_indices": [False, True],
       },
-      #
-      {
-          "dtype": [tf.float32],
-          "index_type": [tf.int32],
-          "input_shape": [[12, 2, 2, 5]],
-          "begin": [[0]],
-          "end": [[1]],
-          "strides": [[1]],
-          "begin_mask": [0],
-          "end_mask": [0],
-          "shrink_axis_mask": [1],
-          "constant_indices": [True],
-      },
+      # TODO(b/73170889) Restore test parameters removed in cl/191608113.
       # 2-D
       {
           "dtype": [tf.float32, tf.int32, tf.int64],
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 9191e69662..b34aca1f09 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1253,6 +1253,83 @@ void ProcessStackOperator(Model* model, StackOperator* op) {
   output_array.copy_shape(*stacked_shape);
 }
 
+// These StridedSlice utility functions are essentially a COPY of those in
+// reference_ops.h. See comments there.
+
+// Use until std::clamp() is available from C++17.
+int Clamp(const int v, const int lo, const int hi) {
+  if (hi < v) return hi;
+  if (v < lo) return lo;
+  return v;
+}
+
+int StartForAxis(StridedSliceOperator const& op, Shape const& input_shape,
+                 int axis) {
+  // Begin with the specified index
+  int start = op.start_indices[axis];
+
+  // begin_mask override
+  if (op.begin_mask & 1 << axis) {
+    if (op.strides[axis] > 0) {
+      // Forward iteration - use the first element. These values will get
+      // clamped below (Note: We could have set them to 0 and axis_size-1, but
+      // use lowest() and max() to maintain symmetry with StopForAxis())
+      start = std::numeric_limits<int>::lowest();
+    } else {
+      // Backward iteration - use the last element.
+      start = std::numeric_limits<int>::max();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.dims(axis);
+  if (start < 0) {
+    start += axis_size;
+  }
+
+  // Clamping
+  start = Clamp(start, 0, axis_size - 1);
+
+  return start;
+}
+
+int StopForAxis(StridedSliceOperator const& op, Shape const& input_shape,
+                int axis) {
+  // Begin with the specified index
+  int stop = op.stop_indices[axis];
+
+  // end_mask override
+  if (op.end_mask & (1 << axis)) {
+    if (op.strides[axis] > 0) {
+      // Forward iteration - use the last element. These values will get
+      // clamped below
+      stop = std::numeric_limits<int>::max();
+    } else {
+      // Backward iteration - use the first element.
+      stop = std::numeric_limits<int>::lowest();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.dims(axis);
+  if (stop < 0) {
+    stop += axis_size;
+  }
+
+  // Clamping
+  // Because the end index points one past the last element, we need slightly
+  // different clamping ranges depending on the direction.
+  if (op.strides[axis] > 0) {
+    // Forward iteration
+    stop = Clamp(stop, 0, axis_size);
+  } else {
+    // Backward iteration
+    stop = Clamp(stop, -1, axis_size - 1);
+  }
+
+  return stop;
+}
+
 void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
@@ -1290,43 +1367,46 @@ void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
     return;
   }
 
-  int dim_count = input_array.shape().dimensions_count();
-  CHECK(op->start_indices.size() == dim_count)
-      << ": Incorrect number of start indices supplied to StridedSlice op with "
-         "output \""
-      << op->outputs[0] << "\". Op requires " << dim_count << " start indices";
-  CHECK(op->stop_indices.size() == dim_count)
-      << ": Incorrect number of stop indices supplied to StridedSlice op with "
-         "output \""
-      << op->outputs[0] << "\". Op requires " << dim_count << " stop indices";
-  CHECK(op->strides.size() == dim_count)
-      << ": Incorrect number of strides supplied to StridedSlice op with "
-         " output \""
-      << op->outputs[0] << "\". Op requires " << dim_count << " strides";
+  int num_input_axes = input_array.shape().dimensions_count();
+  CHECK_LE(op->start_indices.size(), num_input_axes)
+      << "StridedSlice op with output \"" << op->outputs[0]
+      << "\", requires no more than " << num_input_axes << " start indices";
+  CHECK_LE(op->stop_indices.size(), num_input_axes)
+      << "StridedSlice op with output \"" << op->outputs[0]
+      << "\", requires no more than " << num_input_axes << " stop indices";
+  CHECK_LE(op->strides.size(), num_input_axes)
+      << "StridedSlice op with output \"" << op->outputs[0]
+      << "\", requires no more than " << num_input_axes << " strides";
+  for (int i = 0; i < op->strides.size(); i++) {
+    CHECK_NE(op->strides[i], 0) << "Strides must be non-zero. Axis " << i
+                                << " has stride=" << op->strides[i] << ".";
+  }
+
+  // The TensorFlow documentation is not explicit on how it handles fewer
+  // supplied indices than dimensions, but they are accepted. We emulate TF's
+  // behavior by fully iterating over each "forgotten" dimension.
+  op->PadIndices(num_input_axes);
 
   // Create output shape
   std::vector<int>* dims = output_array.mutable_shape()->mutable_dims();
 
   // Compute output shape
-  for (int i = 0; i < dim_count; ++i) {
-    const int mask = 1 << i;
-    int start = (op->begin_mask & mask) ? 0 : op->start_indices[i];
-    if (start < 0) {
-      // handle negative indices
-      start += input_array.shape().dims(i);
-    }
-    int stop = (op->end_mask & mask) ? input_array.shape().dims(i)
-                                     : op->stop_indices[i];
-    if (stop < 0) {
-      // handle negative indices
-      stop += input_array.shape().dims(i);
-    }
-
-    int dim_size = ceil((stop - start) / static_cast<float>(op->strides[i]));
-    dim_size = dim_size < 0 ? 0 : dim_size;
-    if (op->shrink_axis_mask & mask) {
-      CHECK_EQ(dim_size, 1) << "Output size for an axis must compute to 1 when "
-                               "shrinking that axis";
+  for (int axis = 0; axis < num_input_axes; ++axis) {
+    int start_index = StartForAxis(*op, input_array.shape(), axis);
+    int stop_index = StopForAxis(*op, input_array.shape(), axis);
+    int dim_size =
+        ceil(static_cast<float>(stop_index - start_index) / op->strides[axis]);
+
+    CHECK_GT(dim_size, 0)
+        << "Output size for an axis must be greater than 0. Axis " << axis
+        << " computes to size " << dim_size
+        << " for StridedSlice op with output \"" << op->outputs[0] << "\".";
+    if (op->shrink_axis_mask & (1 << axis)) {
+      CHECK_EQ(dim_size, 1)
+          << "Output size for an axis must compute to 1 when shrinking an "
+             "axis. Axis "
+          << axis << " computes to size " << dim_size
+          << " for StridedSlice op with output \"" << op->outputs[0] << "\".";
     } else {
       dims->push_back(dim_size);
     }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
index a0cfc3d597..8df3c2f195 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
@@ -23,40 +23,88 @@ namespace toco {
 
 namespace {
 
+// These StridedSlice utility functions are essentially a COPY of those in
+// reference_ops.h. See comments there.
+
+// Use until std::clamp() is available from C++17.
+int Clamp(const int v, const int lo, const int hi) {
+  if (hi < v) return hi;
+  if (v < lo) return lo;
+  return v;
+}
+
 int StartForAxis(StridedSliceOperator const& op, Shape const& input_shape,
                  int axis) {
-  int start;
+  // Begin with the specified index
+  int start = op.start_indices[axis];
+
+  // begin_mask override
   if (op.begin_mask & 1 << axis) {
-    // If begin mask bit is set, use the first element
-    start = 0;
-  } else {
-    // Otherwise, use the specified element
-    start = op.start_indices[axis];
-    if (start < 0) {
-      // Handle negative indices
-      start += input_shape.dims(axis);
+    if (op.strides[axis] > 0) {
+      // Forward iteration - use the first element. These values will get
+      // clamped below (Note: We could have set them to 0 and axis_size-1, but
+      // use lowest() and max() to maintain symmetry with StopForAxis())
+      start = std::numeric_limits<int>::lowest();
+    } else {
+      // Backward iteration - use the last element.
+      start = std::numeric_limits<int>::max();
     }
   }
+
+  // Handle negative indices
+  int axis_size = input_shape.dims(axis);
+  if (start < 0) {
+    start += axis_size;
+  }
+
+  // Clamping
+  start = Clamp(start, 0, axis_size - 1);
+
   return start;
 }
 
 int StopForAxis(StridedSliceOperator const& op, Shape const& input_shape,
                 int axis) {
-  int stop;
+  // Begin with the specified index
+  int stop = op.stop_indices[axis];
+
+  // end_mask override
   if (op.end_mask & (1 << axis)) {
-    // If end mask bit set, use the last element
-    stop = input_shape.dims(axis);
-  } else {
-    // Otherwise, use the specified element
-    stop = op.stop_indices[axis];
-    if (stop < 0) {
-      // Handle negative indices
-      stop += input_shape.dims(axis);
+    if (op.strides[axis] > 0) {
+      // Forward iteration - use the last element. These values will get
+      // clamped below
+      stop = std::numeric_limits<int>::max();
+    } else {
+      // Backward iteration - use the first element.
+      stop = std::numeric_limits<int>::lowest();
     }
   }
+
+  // Handle negative indices
+  int axis_size = input_shape.dims(axis);
+  if (stop < 0) {
+    stop += axis_size;
+  }
+
+  // Clamping
+  // Because the end index points one past the last element, we need slightly
+  // different clamping ranges depending on the direction.
+  if (op.strides[axis] > 0) {
+    // Forward iteration
+    stop = Clamp(stop, 0, axis_size);
+  } else {
+    // Backward iteration
+    stop = Clamp(stop, -1, axis_size - 1);
+  }
+
   return stop;
 }
 
+bool LoopCondition(int index, int stop, int stride) {
+  // True when we have reached the end of an axis and should loop.
+  return stride > 0 ? index >= stop : index <= stop;
+}
+
 template <ArrayDataType Type>
 void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
                   Array* output_array) {
@@ -73,9 +121,6 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
   int num_input_axes = op.start_indices.size();
   CHECK_EQ(num_input_axes, op.stop_indices.size());
   CHECK_EQ(num_input_axes, op.strides.size());
-  for (int i = 0; i < op.strides.size(); i++) {
-    CHECK_GE(op.strides[i], 0) << "Negative strides usupported";
-  }
 
   // Create a buffer for the output array
   std::vector<DataType<Type>>& output_data =
@@ -103,13 +148,15 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
     // Compute next source input coordinates.
     bool carry = true;
     for (int axis = 0; axis < num_input_axes; axis++) {
+      int stride = op.strides[axis];
       // Increment this axis if we carried from the previous one
       if (carry) {
-        src_coord[axis] += op.strides[axis];
+        src_coord[axis] += stride;
       }
 
       // Check if we've overflowed.
-      if (src_coord[axis] >= StopForAxis(op, input_shape, axis)) {
+      int stop = StopForAxis(op, input_shape, axis);
+      if (LoopCondition(src_coord[axis], stop, stride)) {
         // Reset axis and set carry
         src_coord[axis] = StartForAxis(op, input_shape, axis);
         carry = true;
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 1c4c96ae70..705a9d69a6 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -24,6 +24,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -845,6 +846,60 @@ struct StridedSliceOperator : Operator {
   int end_mask;
   int new_axis_mask;
   int shrink_axis_mask;
+
+  StridedSliceOperator(const StridedSliceOperator& other)
+      : Operator(OperatorType::kStridedSlice) {
+    inputs = other.inputs;
+    outputs = other.outputs;
+
+    start_indices = other.start_indices;
+    stop_indices = other.stop_indices;
+    strides = other.strides;
+
+    begin_mask = other.begin_mask;
+    ellipsis_mask = other.ellipsis_mask;
+    end_mask = other.end_mask;
+    new_axis_mask = other.new_axis_mask;
+    shrink_axis_mask = other.shrink_axis_mask;
+  }
+
+  void PadIndices(int dim_count) {
+    // Add indices and mask bits to fully include extra dimensions
+    CHECK_GE(dim_count, start_indices.size());
+    CHECK_EQ(start_indices.size(), stop_indices.size());
+    CHECK_EQ(stop_indices.size(), strides.size());
+
+    for (int i = start_indices.size(); i < dim_count; i++) {
+      start_indices.push_back(0);
+      stop_indices.push_back(0);
+      strides.push_back(1);
+      begin_mask |= 1 << i;
+      end_mask |= 1 << i;
+    }
+  }
+
+  void ReverseIndices() {
+    CHECK_EQ(start_indices.size(), stop_indices.size());
+    CHECK_EQ(stop_indices.size(), strides.size());
+
+    std::reverse(start_indices.begin(), start_indices.end());
+    std::reverse(stop_indices.begin(), stop_indices.end());
+    std::reverse(strides.begin(), strides.end());
+
+    begin_mask = toco::port::ReverseBits32(static_cast<uint32>(begin_mask)) >>
+                 (32 - start_indices.size());
+    ellipsis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(ellipsis_mask)) >>
+        (32 - start_indices.size());
+    end_mask = toco::port::ReverseBits32(static_cast<uint32>(end_mask)) >>
+               (32 - start_indices.size());
+    new_axis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(new_axis_mask)) >>
+        (32 - start_indices.size());
+    shrink_axis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(shrink_axis_mask)) >>
+        (32 - start_indices.size());
+  }
 };
 
 // Reshaping operator, reshaping its input array to a two-dimensional shape
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h
index 4be3b5a0bf..2d5c231bef 100644
--- a/tensorflow/contrib/lite/toco/toco_port.h
+++ b/tensorflow/contrib/lite/toco/toco_port.h
@@ -75,6 +75,14 @@ Status Exists(const string& filename, const Options& options);
 void CopyToBuffer(const ::Cord& src, char* dest);
 #endif  // PLATFORM_GOOGLE
 void CopyToBuffer(const string& src, char* dest);
+
+inline uint32 ReverseBits32(uint32 n) {
+  n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1);
+  n = ((n >> 2) & 0x33333333) | ((n & 0x33333333) << 2);
+  n = ((n >> 4) & 0x0F0F0F0F) | ((n & 0x0F0F0F0F) << 4);
+  return (((n & 0xFF) << 24) | ((n & 0xFF00) << 8) | ((n & 0xFF0000) >> 8) |
+          ((n & 0xFF000000) >> 24));
+}
 }  // namespace port
 
 inline bool ParseFromStringOverload(const std::string& in,
-- 
GitLab


From 84b6dac70710075e67fcf40ccd29033335d63f83 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Tue, 17 Apr 2018 13:27:46 -0700
Subject: [PATCH 037/450] Internal change.

PiperOrigin-RevId: 193246563
---
 tensorflow/contrib/timeseries/examples/BUILD | 33 +++++++++++++++-----
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 32e948a009..355303acf6 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -8,14 +8,22 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+config_setting(
+    name = "empty_condition",
+    values = {"define": "UNUSED=unused"},
+)
+
 py_binary(
     name = "predict",
     srcs = ["predict.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
-        "//tensorflow:tensorflow_py",
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
         "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
     ],
 )
 
@@ -41,9 +49,12 @@ py_binary(
     data = ["data/changepoints.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
-        "//tensorflow:tensorflow_py",
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
         "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
     ],
 )
 
@@ -64,9 +75,12 @@ py_binary(
     data = ["data/multivariate_level.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
-        "//tensorflow:tensorflow_py",
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
         "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
     ],
 )
 
@@ -89,11 +103,14 @@ py_binary(
     data = ["data/multivariate_periods.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
+        "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/timeseries/python/timeseries:estimators",
         "//tensorflow/contrib/timeseries/python/timeseries:model",
-        "//third_party/py/numpy",
     ],
 )
 
-- 
GitLab


From 197572bd517a4bc6f4850dfecf3288818d8d84ee Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 13:30:45 -0700
Subject: [PATCH 038/450] Unpack multiple assignments when processing lists.
 This supports the cases "a, b = [], []" and "[a, b] = [], []". Also expand
 the static analysis to support list unpacking constructs.

PiperOrigin-RevId: 193247024
---
 .../contrib/autograph/converters/lists.py     | 34 +++++++++-----
 .../autograph/converters/lists_test.py        | 46 ++++++++++++++++++-
 .../pyct/static_analysis/type_info.py         |  2 +-
 3 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/lists.py b/tensorflow/contrib/autograph/converters/lists.py
index 6dda554acc..b49521b2c3 100644
--- a/tensorflow/contrib/autograph/converters/lists.py
+++ b/tensorflow/contrib/autograph/converters/lists.py
@@ -82,23 +82,33 @@ class ListTransformer(transformer.Base):
             element=call_node.args[0])
     return node
 
+  def _replace_list_constructors(self, targets, values):
+    for target in targets:
+      if (isinstance(target, (gast.Tuple, gast.List)) and
+          isinstance(values, (gast.Tuple, gast.List))):
+        n_targets = len(target.elts)
+        for i in range(n_targets):
+          target_el, value_el = target.elts[i], values.elts[i]
+          values.elts[i] = self._replace_list_constructors(
+              (target_el,), value_el)
+        return values
+      if isinstance(values, gast.List):
+        if values.elts:
+          return self._pre_populated_list(values)
+        else:
+          return self._empty_list(values)
+    return values
+
   def visit_Assign(self, node):
     node = self.generic_visit(node)
 
     # Only convert lists when they are assigned to a variable, e.g.:
     #   l = []
-    # TODO(mdan): This rule should be improved.
-    if len(node.targets) != 1:
-      return node
-    if not isinstance(node.value, gast.List):
-      return node
-    if not isinstance(node.value.ctx, gast.Load):
-      return node
-
-    if node.value.elts:
-      node.value = self._pre_populated_list(node.value)
-    else:
-      node.value = self._empty_list(node.value)
+    # TODO(mdan): A similar pattern exists in type_info.py
+    # We should add a generic "unpack_assignment" function to the base
+    # transformer, that has the same effect as applying some logic to the SSA
+    # form.
+    node.value = self._replace_list_constructors(node.targets, node.value)
     return node
 
 
diff --git a/tensorflow/contrib/autograph/converters/lists_test.py b/tensorflow/contrib/autograph/converters/lists_test.py
index 749ba14347..74c6dc64f1 100644
--- a/tensorflow/contrib/autograph/converters/lists_test.py
+++ b/tensorflow/contrib/autograph/converters/lists_test.py
@@ -45,7 +45,51 @@ class ListTest(converter_test_base.TestCase):
       result.utils = utils
       result.dtypes = dtypes
       with self.test_session() as sess:
-        self.assertEqual(test_fn(), sess.run(result.test_fn().stack()))
+        self.assertAllEqual([1], sess.run(result.test_fn().stack()))
+
+  def test_empty_annotated_lists_unpacked(self):
+
+    def test_fn():
+      l, m = [], []
+      utils.set_element_type(l, dtypes.int32)
+      utils.set_element_type(m, dtypes.int32)
+      l.append(1)
+      m.append(2)
+      return l, m
+
+    node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils})
+    node = lists.transform(node, self.ctx)
+
+    with self.compiled(node, tensor_array_ops.TensorArray,
+                       dtypes.int32) as result:
+      result.utils = utils
+      result.dtypes = dtypes
+      with self.test_session() as sess:
+        res_l, res_m = result.test_fn()
+        self.assertEqual([1], sess.run(res_l.stack()))
+        self.assertEqual([2], sess.run(res_m.stack()))
+
+  def test_empty_annotated_lists_list_unpacked(self):
+
+    def test_fn():
+      [l, m] = [], []
+      utils.set_element_type(l, dtypes.int32)
+      utils.set_element_type(m, dtypes.int32)
+      l.append(1)
+      m.append(2)
+      return l, m
+
+    node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils})
+    node = lists.transform(node, self.ctx)
+
+    with self.compiled(node, tensor_array_ops.TensorArray,
+                       dtypes.int32) as result:
+      result.utils = utils
+      result.dtypes = dtypes
+      with self.test_session() as sess:
+        res_l, res_m = result.test_fn()
+        self.assertEqual([1], sess.run(res_l.stack()))
+        self.assertEqual([2], sess.run(res_m.stack()))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
index 2f553e1e23..763997968c 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
@@ -184,7 +184,7 @@ class TypeInfoResolver(transformer.Base):
     # Multiple targets mean multiple assignment.
     for target in targets:
       # Tuple target means unpacking.
-      if isinstance(target, gast.Tuple):
+      if isinstance(target, (gast.Tuple, gast.List)):
         for i, target_item in enumerate(target.elts):
           # Two cases here:
           #   1. Static unpacking, e.g. a, b = c, d
-- 
GitLab


From 1628d18d24400f08b768b545f839e32b44a097c7 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Tue, 17 Apr 2018 13:31:54 -0700
Subject: [PATCH 039/450] Use is_constructible instead of is_convertible.

Before this, all objects would follow the slow path (nothing is
convertible to AlphaNum, because it has a private copy constructor).

Before:
entry {
  name: "MicroBenchmarks.benchmark_defun_matmul_2_by_2_CPU"
  iters: 30000
  wall_time: 67.5895690918
  extras {
    key: "examples_per_sec"
    value {
      double_value: 14795.1823549
    }
  }
}

After:
entry {
  name: "MicroBenchmarks.benchmark_defun_matmul_2_by_2_CPU"
  iters: 30000
  wall_time: 61.0044002533
  extras {
    key: "examples_per_sec"
    value {
      double_value: 16392.2601623
    }
  }
}
PiperOrigin-RevId: 193247183
---
 tensorflow/core/lib/core/errors.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/lib/core/errors.h b/tensorflow/core/lib/core/errors.h
index 1a0f4be2ea..51c09032df 100644
--- a/tensorflow/core/lib/core/errors.h
+++ b/tensorflow/core/lib/core/errors.h
@@ -42,7 +42,7 @@ namespace internal {
 // Eventually absl::strings will have native support for this and we will be
 // able to completely remove PrepareForStrCat().
 template <typename T>
-typename std::enable_if<!std::is_convertible<T, strings::AlphaNum>::value,
+typename std::enable_if<!std::is_constructible<strings::AlphaNum, T>::value,
                         string>::type
 PrepareForStrCat(const T& t) {
   std::stringstream ss;
-- 
GitLab


From fabf01011654be16e3aeb08192caa76c9595cfde Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 13:36:24 -0700
Subject: [PATCH 040/450] Make GroupRec* const in GroupRecCallback by marking
 mu mutable in CollectiveParamResolverLocal::GroupRec.

PiperOrigin-RevId: 193247799
---
 .../collective_param_resolver_local.cc         | 17 +++++++++--------
 .../collective_param_resolver_local.h          | 18 ++++++++++--------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index b34950b2f4..393d3f824d 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -401,7 +401,7 @@ void CollectiveParamResolverLocal::SetDefaultRank(const string& device,
 }
 
 Status CollectiveParamResolverLocal::InitInstanceSharedParams(
-    GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir) {
+    const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir) {
   VLOG(1) << "InitInstanceSharedParams " << ir;
   ir->shared.instance = cp->instance;
   {
@@ -443,7 +443,7 @@ Status CollectiveParamResolverLocal::InitInstanceSharedParams(
 }
 
 void CollectiveParamResolverLocal::CompleteDefaultRanking(
-    GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
+    const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
     const std::vector<DeviceLocality>& localities) {
   // Establish an instance-specific default rank order for devices
   // based on localities.  This rank order should be a good ring
@@ -485,7 +485,7 @@ void CollectiveParamResolverLocal::CallbackWithStatus(
 }
 
 void CollectiveParamResolverLocal::FindInstanceRec(
-    GroupRec* gr, CollectiveParams* cp, const InstanceRecCallback& done) {
+    const GroupRec* gr, CollectiveParams* cp, const InstanceRecCallback& done) {
   InstanceRec* irec = nullptr;
   bool exit_outside_locks = false;
   {
@@ -544,7 +544,8 @@ void CollectiveParamResolverLocal::CompleteParamsAsync(
   VLOG(1) << "CompleteParams " << device << " for " << cp << ": "
           << cp->ToString();
   CompleteGroupLocal(
-      device, cp, [this, device, cp, done](const Status& s, GroupRec* gr) {
+      device, cp,
+      [this, device, cp, done](const Status& s, const GroupRec* gr) {
         if (s.ok()) {
           CompleteInstanceLocal(device, gr, cp, cp->is_source, done);
         } else {
@@ -563,8 +564,8 @@ void CollectiveParamResolverLocal::CompleteInstanceAsync(
 }
 
 void CollectiveParamResolverLocal::CompleteInstanceLocal(
-    const string& device, GroupRec* gr, CollectiveParams* cp, bool is_source,
-    const StatusCallback& done) {
+    const string& device, const GroupRec* gr, CollectiveParams* cp,
+    bool is_source, const StatusCallback& done) {
   VLOG(1) << "CompleteInstanceLocal " << device
           << " instance_key: " << cp->instance.instance_key << " gr " << gr;
 
@@ -589,8 +590,8 @@ void CollectiveParamResolverLocal::CompleteInstanceLocal(
 }
 
 void CollectiveParamResolverLocal::CompleteInstanceFromInitializedIRec(
-    const string& device, GroupRec* gr, CollectiveParams* cp, InstanceRec* ir,
-    bool is_source, const StatusCallback& done) {
+    const string& device, const GroupRec* gr, CollectiveParams* cp,
+    InstanceRec* ir, bool is_source, const StatusCallback& done) {
   // Populate the fields common across instance.
   {
     mutex_lock l(ir->out_mu);
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
index ff3415b0a9..7b2946e936 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.h
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -56,7 +56,7 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   // Used to complete/verify CollGroup.
   struct GroupRec {
     CollGroupParams group;
-    mutex mu;
+    mutable mutex mu;
     Status status GUARDED_BY(mu);
     std::set<string> device_set GUARDED_BY(mu);
     std::vector<string> device_list GUARDED_BY(mu);
@@ -71,7 +71,8 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   // calling done.  Callback GroupRec* arg is only valid if status is ok.
   // Ownership of GroupRec stays with this object and does not pass to the
   // callback.
-  typedef std::function<void(const Status& s, GroupRec* gr)> GroupRecCallback;
+  typedef std::function<void(const Status& s, const GroupRec* gr)>
+      GroupRecCallback;
   void CompleteGroupLocal(const string& device, CollectiveParams* cp,
                           const GroupRecCallback& done)
       LOCKS_EXCLUDED(group_mu_);
@@ -135,7 +136,7 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   // with this object and does not pass to the callback.
   typedef std::function<void(const Status& s, InstanceRec* ir)>
       InstanceRecCallback;
-  void FindInstanceRec(GroupRec* gr, CollectiveParams* cp,
+  void FindInstanceRec(const GroupRec* gr, CollectiveParams* cp,
                        const InstanceRecCallback& done)
       LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
 
@@ -144,27 +145,28 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   //
   // Preconditions:
   //  cp is populated with all DeviceLocalities
-  Status InitInstanceSharedParams(GroupRec* gr, const CollectiveParams* cp,
-                                  InstanceRec* ir)
+  Status InitInstanceSharedParams(const GroupRec* gr,
+                                  const CollectiveParams* cp, InstanceRec* ir)
       EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu) LOCKS_EXCLUDED(gr->mu);
 
   // Establishes the final order of ir->shared.instance.device_names and
   // ir->shared.instance.task_names by considering localities of all devices.
-  void CompleteDefaultRanking(GroupRec* gr, const CollectiveParams* cp,
+  void CompleteDefaultRanking(const GroupRec* gr, const CollectiveParams* cp,
                               InstanceRec* ir,
                               const std::vector<DeviceLocality>& localities)
       EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu);
 
   // Finish populating *cp.
   // Precondition: *gr has been fully populated by CompleteGroupLocal.
-  void CompleteInstanceLocal(const string& device, GroupRec* gr,
+  void CompleteInstanceLocal(const string& device, const GroupRec* gr,
                              CollectiveParams* cp, bool is_source,
                              const StatusCallback& done)
       LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
 
   // Finish populating *cp from fully initialized *ir.
   // Precondition: *gr and *ir are fully populated.
-  void CompleteInstanceFromInitializedIRec(const string& device, GroupRec* gr,
+  void CompleteInstanceFromInitializedIRec(const string& device,
+                                           const GroupRec* gr,
                                            CollectiveParams* cp,
                                            InstanceRec* ir, bool is_source,
                                            const StatusCallback& done)
-- 
GitLab


From 72df3d60faa8bbf42bb3f5c7ed38887215fad037 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 13:36:46 -0700
Subject: [PATCH 041/450] [XLA] Redesign: support xla::XlaComputation in
 compile-only client and service.

PiperOrigin-RevId: 193247845
---
 tensorflow/compiler/xla/client/BUILD          |  1 +
 .../xla/client/compile_only_client.cc         | 18 +++++++++++++
 .../compiler/xla/client/compile_only_client.h | 22 +++++++++++++++
 .../xla/service/compile_only_service.cc       | 27 +++++++++++++++++++
 .../xla/service/compile_only_service.h        | 19 +++++++++++++
 5 files changed, 87 insertions(+)

diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index a299c2afd4..286d06d12f 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -130,6 +130,7 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:compile_only_service",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/core:stream_executor_no_cuda",
diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc
index 59662c95ac..96e38bca01 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.cc
+++ b/tensorflow/compiler/xla/client/compile_only_client.cc
@@ -39,6 +39,24 @@ CompileOnlyClient::CompileAheadOfTime(
   return compiler_service_->CompileAheadOfTime(service_instances, options);
 }
 
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyClient::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<CompileOnlyService::AotXlaComputationInstance> service_instances;
+  service_instances.reserve(computations.size());
+  for (const AotXlaComputationInstance& instance : computations) {
+    service_instances.emplace_back();
+    CompileOnlyService::AotXlaComputationInstance& service_instance =
+        service_instances.back();
+    TF_RET_CHECK(instance.computation != nullptr);
+    service_instance.computation = instance.computation->proto();
+    service_instance.argument_layouts = instance.argument_layouts;
+    service_instance.result_layout = instance.result_layout;
+  }
+  return compiler_service_->CompileAheadOfTime(service_instances, options);
+}
+
 int64 CompileOnlyClient::PointerSizeForTriple(tensorflow::StringPiece triple) {
   llvm::Triple llvm_triple(
       llvm::Triple::normalize(llvm::StringRef(triple.data(), triple.size())));
diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h
index 5900048711..c8725b8517 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.h
+++ b/tensorflow/compiler/xla/client/compile_only_client.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/compile_only_service.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -54,6 +55,27 @@ class CompileOnlyClient : public Client {
       const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
       const AotCompilationOptions& options);
 
+  // A description of an xla computation to compile using CompileAheadOfTime.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  struct AotXlaComputationInstance {
+    const XlaComputation* computation;
+    // Inform the compiler of the expected layout for arguments.
+    std::vector<const Shape*> argument_layouts;
+    // Specifies the expected result layout.
+    const Shape* result_layout;
+  };
+
+  // Compiles a list of xla computations for ahead-of-time execution.  This is
+  // intended for use in static compilation. The |options| parameter describes
+  // the target for which the compiler should emit code.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+      const AotCompilationOptions& options);
+
   // Returns the size of a pointer in bytes for a given triple.
   static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
 
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index c83da9eddc..fb70ea5315 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -61,6 +61,33 @@ CompileOnlyService::CompileOnlyService(const ServiceOptions& options,
                                        Compiler* compiler)
     : Service(options, /*execute_backend=*/nullptr), compiler_(compiler) {}
 
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyService::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<std::unique_ptr<HloModule>> hlo_modules;
+  for (const AotXlaComputationInstance& instance : computations) {
+    TF_RET_CHECK(instance.computation.has_program_shape());
+
+    const DebugOptions& debug_options = options.debug_options();
+    const auto& program_shape = instance.computation.program_shape();
+    ExecutionOptions execution_options;
+    *execution_options.mutable_debug_options() = debug_options;
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModuleConfig> module_config,
+        CreateModuleConfig(program_shape, instance.argument_layouts,
+                           &execution_options));
+
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModule> hlo_module,
+        HloModule::CreateFromProto(instance.computation, *module_config));
+    TF_RETURN_IF_ERROR(MaybeDumpHloModule(*hlo_module));
+    hlo_modules.push_back(std::move(hlo_module));
+  }
+
+  return compiler_->CompileAheadOfTime(std::move(hlo_modules), options);
+}
+
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 CompileOnlyService::CompileAheadOfTime(
     const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
index 9859941c6c..dd8de42a0f 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.h
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -53,6 +53,25 @@ class CompileOnlyService : public Service {
       const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
       const AotCompilationOptions& Options);
 
+  // A description of an xla computation to compile using CompileAheadOfTime.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  struct AotXlaComputationInstance {
+    HloModuleProto computation;
+    std::vector<const Shape*> argument_layouts;
+    const Shape* result_layout = nullptr;
+  };
+
+  // Compiles a list of xla computations for ahead-of-time execution.  This is
+  // intended for use in static compilation.  See
+  // |CompileOnlyClient::CompileAheadOfTime| for additional details.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+      const AotCompilationOptions& options);
+
   // Override Service methods that require or imply the existence of an
   // execute backend.  Note that this does not include TransferToClient, as
   // computing constants produces global data that we may wish to transfer.
-- 
GitLab


From 4764bf2986e2779d5c80b5aca08d72d5c878818b Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 17 Apr 2018 14:26:16 -0700
Subject: [PATCH 042/450] [StreamExecutor] Rename ::perftools::gputools ->
 ::stream_executor, part 1.

Step 1 of re-namespace'ing StreamExecutor into ::stream_executor.

This moves everything inside of stream_executor/... into the
::stream_executor namespace, and leaves ::perftools::gputools behind as
a namespace alias.  The next steps will clean up users to use the new
namespace.

This is mostly a mechanical change, but it also includes a bunch of
non-mechanical changes that ideally would be split out into separate
patches.  Unfortunately they all sort of need to be shoved in here for
various reasons:

 - forward declarations need to be in the same namespace as the actual
   types, so we need to change all forward declarations of
   StreamExecutor types in this one patch.

 - Uses of these forward declarations need to be changed to the new
   namespace (or otherwise we need to add a namespace alias to the
   relevant header, but this is pretty ugly).

 - Various initialization code needs to live in StreamExecutor's "real"
   namespace, so all this needs to be changed.

PiperOrigin-RevId: 193256128
---
 .../compiler/xla/executable_run_options.cc    |   4 +-
 .../compiler/xla/executable_run_options.h     |  24 +++--
 tensorflow/compiler/xla/types.h               |   8 ++
 .../core/common_runtime/gpu/gpu_event_mgr.h   |   6 +-
 .../core/common_runtime/gpu/gpu_id_utils.h    |   5 +-
 tensorflow/core/common_runtime/gpu/gpu_init.h |   8 +-
 .../core/common_runtime/gpu/gpu_util.cc       |   4 +-
 tensorflow/core/common_runtime/gpu/gpu_util.h |   4 +-
 .../core/common_runtime/gpu_device_context.h  |  37 ++++---
 tensorflow/core/framework/device_base.h       |  15 ++-
 .../default/from_stream_executor_status.h     |   2 -
 tensorflow/core/platform/stream_executor.h    |  11 ++
 tensorflow/core/platform/types.h              |   8 ++
 tensorflow/stream_executor/blas.cc            |   6 +-
 tensorflow/stream_executor/blas.h             |   6 +-
 .../stream_executor/cuda/cuda_activation.cc   |   6 +-
 .../stream_executor/cuda/cuda_activation.h    |   6 +-
 tensorflow/stream_executor/cuda/cuda_blas.cc  |  94 ++++++++--------
 tensorflow/stream_executor/cuda/cuda_blas.h   |   6 +-
 .../stream_executor/cuda/cuda_diagnostics.cc  |   6 +-
 .../stream_executor/cuda/cuda_diagnostics.h   |   6 +-
 tensorflow/stream_executor/cuda/cuda_dnn.cc   |  82 +++++++-------
 tensorflow/stream_executor/cuda/cuda_dnn.h    |   6 +-
 .../stream_executor/cuda/cuda_driver.cc       |   6 +-
 tensorflow/stream_executor/cuda/cuda_driver.h |   6 +-
 tensorflow/stream_executor/cuda/cuda_event.cc |   6 +-
 tensorflow/stream_executor/cuda/cuda_event.h  |   6 +-
 tensorflow/stream_executor/cuda/cuda_fft.cc   | 102 +++++++++---------
 tensorflow/stream_executor/cuda/cuda_fft.h    |   6 +-
 .../stream_executor/cuda/cuda_gpu_executor.cc |  18 ++--
 .../stream_executor/cuda/cuda_gpu_executor.h  |   6 +-
 .../stream_executor/cuda/cuda_helpers.h       |   6 +-
 tensorflow/stream_executor/cuda/cuda_kernel.h |   6 +-
 .../stream_executor/cuda/cuda_platform.cc     |  18 ++--
 .../stream_executor/cuda/cuda_platform.h      |   6 +-
 .../stream_executor/cuda/cuda_platform_id.cc  |   6 +-
 .../stream_executor/cuda/cuda_platform_id.h   |   6 +-
 tensorflow/stream_executor/cuda/cuda_rng.cc   |  87 ++++++++-------
 tensorflow/stream_executor/cuda/cuda_rng.h    |   6 +-
 .../stream_executor/cuda/cuda_stream.cc       |   6 +-
 tensorflow/stream_executor/cuda/cuda_stream.h |   6 +-
 tensorflow/stream_executor/cuda/cuda_timer.cc |   6 +-
 tensorflow/stream_executor/cuda/cuda_timer.h  |  10 +-
 .../stream_executor/cuda/cudnn_version.cc     |   6 +-
 .../stream_executor/cuda/cudnn_version.h      |   6 +-
 .../cuda/cudnn_version_test.cc                |   6 +-
 .../stream_executor/device_description.cc     |   7 +-
 .../stream_executor/device_description.h      |   6 +-
 tensorflow/stream_executor/device_memory.h    |  13 ++-
 tensorflow/stream_executor/device_options.h   |   6 +-
 tensorflow/stream_executor/dnn.cc             |   6 +-
 tensorflow/stream_executor/dnn.h              |   6 +-
 tensorflow/stream_executor/dso_loader.cc      |   6 +-
 tensorflow/stream_executor/dso_loader.h       |   6 +-
 tensorflow/stream_executor/event.cc           |   6 +-
 tensorflow/stream_executor/event.h            |   6 +-
 tensorflow/stream_executor/executor_cache.cc  |   6 +-
 tensorflow/stream_executor/executor_cache.h   |   6 +-
 tensorflow/stream_executor/fft.h              |   8 +-
 .../stream_executor/host/host_gpu_executor.cc |   6 +-
 .../stream_executor/host/host_gpu_executor.h  |   6 +-
 .../stream_executor/host/host_platform.cc     |  16 ++-
 .../stream_executor/host/host_platform.h      |   6 +-
 .../stream_executor/host/host_platform_id.cc  |   6 +-
 .../stream_executor/host/host_platform_id.h   |   6 +-
 .../stream_executor/host/host_stream.cc       |   6 +-
 tensorflow/stream_executor/host/host_stream.h |   6 +-
 tensorflow/stream_executor/host/host_timer.cc |   6 +-
 tensorflow/stream_executor/host/host_timer.h  |   6 +-
 tensorflow/stream_executor/host_buffer.h      |   6 +-
 tensorflow/stream_executor/kernel.cc          |   6 +-
 tensorflow/stream_executor/kernel.h           |  12 +--
 .../stream_executor/kernel_cache_config.h     |   6 +-
 tensorflow/stream_executor/kernel_spec.cc     |   7 +-
 tensorflow/stream_executor/kernel_spec.h      |   6 +-
 tensorflow/stream_executor/launch_dim.h       |   8 +-
 tensorflow/stream_executor/lib/array_slice.h  |  13 ++-
 tensorflow/stream_executor/lib/casts.h        |   8 +-
 tensorflow/stream_executor/lib/demangle.cc    |   6 +-
 tensorflow/stream_executor/lib/demangle.h     |   6 +-
 tensorflow/stream_executor/lib/env.h          |   6 +-
 tensorflow/stream_executor/lib/error.h        |   8 +-
 .../stream_executor/lib/human_readable.h      |   6 +-
 tensorflow/stream_executor/lib/initialize.h   |  17 ++-
 .../stream_executor/lib/inlined_vector.h      |   6 +-
 tensorflow/stream_executor/lib/mathutil.h     |   6 +-
 tensorflow/stream_executor/lib/notification.h |   6 +-
 tensorflow/stream_executor/lib/numbers.cc     |   6 +-
 tensorflow/stream_executor/lib/numbers.h      |   6 +-
 tensorflow/stream_executor/lib/path.cc        |   6 +-
 tensorflow/stream_executor/lib/path.h         |   6 +-
 .../stream_executor/lib/process_state.cc      |   6 +-
 .../stream_executor/lib/process_state.h       |   6 +-
 tensorflow/stream_executor/lib/ptr_util.h     |  14 ++-
 tensorflow/stream_executor/lib/stacktrace.h   |   6 +-
 tensorflow/stream_executor/lib/status.h       |  17 ++-
 tensorflow/stream_executor/lib/statusor.h     |   8 +-
 tensorflow/stream_executor/lib/str_util.h     |   6 +-
 tensorflow/stream_executor/lib/strcat.h       |   8 +-
 tensorflow/stream_executor/lib/stringpiece.h  |   6 +-
 tensorflow/stream_executor/lib/stringprintf.h |   6 +-
 .../stream_executor/lib/thread_options.h      |   6 +-
 tensorflow/stream_executor/lib/threadpool.h   |   6 +-
 .../stream_executor/multi_platform_manager.cc |   6 +-
 .../stream_executor/multi_platform_manager.h  |  18 ++--
 tensorflow/stream_executor/platform.cc        |   6 +-
 tensorflow/stream_executor/platform.h         |   8 +-
 .../stream_executor/platform/default/mutex.h  |   6 +-
 tensorflow/stream_executor/platform/port.h    |   8 +-
 tensorflow/stream_executor/plugin.cc          |   6 +-
 tensorflow/stream_executor/plugin.h           |   6 +-
 tensorflow/stream_executor/plugin_registry.cc |   6 +-
 tensorflow/stream_executor/plugin_registry.h  |   6 +-
 tensorflow/stream_executor/rng.cc             |   6 +-
 tensorflow/stream_executor/rng.h              |   6 +-
 .../stream_executor/scratch_allocator.cc      |   6 +-
 .../stream_executor/scratch_allocator.h       |   6 +-
 .../stream_executor/shared_memory_config.h    |   6 +-
 tensorflow/stream_executor/stream.cc          |   6 +-
 tensorflow/stream_executor/stream.h           |   6 +-
 tensorflow/stream_executor/stream_executor.h  |  11 ++
 .../stream_executor_internal.cc               |   6 +-
 .../stream_executor_internal.h                |   6 +-
 .../stream_executor/stream_executor_pimpl.cc  |   6 +-
 .../stream_executor/stream_executor_pimpl.h   |   8 +-
 .../temporary_device_memory.cc                |   6 +-
 .../stream_executor/temporary_device_memory.h |   6 +-
 .../temporary_memory_manager.cc               |   6 +-
 .../temporary_memory_manager.h                |   6 +-
 tensorflow/stream_executor/timer.cc           |   6 +-
 tensorflow/stream_executor/timer.h            |   6 +-
 tensorflow/stream_executor/trace_listener.h   |   6 +-
 132 files changed, 572 insertions(+), 744 deletions(-)

diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 1700c97718..99b8f0558e 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -36,12 +36,12 @@ DeviceMemoryAllocator* ExecutableRunOptions::allocator() const {
 }
 
 ExecutableRunOptions& ExecutableRunOptions::set_stream(
-    perftools::gputools::Stream* stream) {
+    stream_executor::Stream* stream) {
   stream_ = stream;
   return *this;
 }
 
-perftools::gputools::Stream* ExecutableRunOptions::stream() const {
+stream_executor::Stream* ExecutableRunOptions::stream() const {
   return stream_;
 }
 
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index 2c1d9ffff1..1a095a82cc 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,29 +16,31 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
-// Intentionally forward declared so that ExecutableRunOptions can be linked
+// These classes are forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
 // need to be linked).
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Stream;
 class Platform;
-}
-}
+}  // namespace stream_executor
 
 namespace tensorflow {
 namespace thread {
 class ThreadPool;
-}
-}
+}  // namespace thread
+}  // namespace tensorflow
 
 namespace Eigen {
 struct ThreadPoolDevice;
-}
+}  // namespace Eigen
 
 namespace xla {
 
+// TODO(b/77980417): Once the perftools::gputools -> stream_executor migration
+// is complete, add "namespace se = stream_executor;" here and
+// s/stream_executor::/se::/ to match our idiom elsewhere.
+
 class DeviceMemoryAllocator;
 class DeviceAssignment;
 class ExecutionProfile;
@@ -61,8 +63,8 @@ class ExecutableRunOptions {
   // If set, this is the stream to run the computation on. The platform of the
   // stream must match the platform the executable was built for.  A value of
   // nullptr indicates the option has not been set.
-  ExecutableRunOptions& set_stream(perftools::gputools::Stream* stream);
-  perftools::gputools::Stream* stream() const;
+  ExecutableRunOptions& set_stream(stream_executor::Stream* stream);
+  stream_executor::Stream* stream() const;
 
   // Sets the thread pool on which to run parallel CPU backend
   // computations. Does not take ownership.
@@ -91,7 +93,7 @@ class ExecutableRunOptions {
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
   DeviceAssignment* device_assignment_ = nullptr;
-  perftools::gputools::Stream* stream_ = nullptr;
+  stream_executor::Stream* stream_ = nullptr;
   tensorflow::thread::ThreadPool* inter_op_thread_pool_ = nullptr;
   const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
   ExecutionProfile* execution_profile_ = nullptr;
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index 9fa4297523..20f3f1b957 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -46,4 +46,12 @@ using ::Eigen::half;
 
 }  // namespace xla
 
+// Alias namespace ::stream_executor as ::xla::se.
+namespace stream_executor {}
+namespace xla {
+// TODO(b/77980417): Uncomment this once all namespace aliases named 'se' are
+// removed in ::xla.
+// namespace se = ::stream_executor;
+}  // namespace xla
+
 #endif  // TENSORFLOW_COMPILER_XLA_TYPES_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
index d23898e1f2..fd5f50ca4e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -29,13 +29,11 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Event;
 class Stream;
 class StreamExecutor;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
index 2e90687fe8..5c503d1261 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
@@ -23,7 +23,10 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 
 namespace tensorflow {
-namespace gpu = ::perftools::gputools;
+
+// TODO(b/77980417): Remove this and use the regular tensorflow::se alias once
+// that's available.
+namespace gpu = ::stream_executor;
 
 // Utility methods for translation between Tensorflow GPU ids and CUDA GPU ids.
 class GpuIdUtil {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.h b/tensorflow/core/common_runtime/gpu/gpu_init.h
index 927d05d5ba..bfd7a77f83 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.h
@@ -18,11 +18,9 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Platform;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
@@ -34,7 +32,7 @@ Status ValidateGPUMachineManager();
 // initializing the GPUs on the machine if needed the first time it is
 // called.  Must only be called when there is a valid GPU environment
 // in the process (e.g., ValidateGPUMachineManager() returns OK).
-perftools::gputools::Platform* GPUMachineManager();
+stream_executor::Platform* GPUMachineManager();
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index a0f5877d62..5214ceaae5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -60,7 +60,9 @@ using perftools::gputools::Stream;
 
 namespace tensorflow {
 
-namespace gpu = ::perftools::gputools;
+// TODO(b/77980417): Remove this and use the regular tensorflow::se alias once
+// that's available.
+namespace gpu = ::stream_executor;
 
 Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src,
                    const Tensor* dst,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
index d99a0b1f61..337dc89895 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -27,7 +27,9 @@ namespace tensorflow {
 class RecvTensorResponse;
 class TensorProto;
 
-namespace gpu = ::perftools::gputools;
+// TODO(b/77980417): Remove this and use the regular tensorflow::se alias once
+// that's available.
+namespace gpu = ::stream_executor;
 
 class GPUUtil {
  public:
diff --git a/tensorflow/core/common_runtime/gpu_device_context.h b/tensorflow/core/common_runtime/gpu_device_context.h
index 8b1430f021..38a18cd087 100644
--- a/tensorflow/core/common_runtime/gpu_device_context.h
+++ b/tensorflow/core/common_runtime/gpu_device_context.h
@@ -19,23 +19,22 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/device_base.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Stream;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
-namespace gpu = ::perftools::gputools;
+// TODO(b/77980417): Replace stream_executor:: with se:: once our namespace
+// migration is complete and the alias is available.
 
 class GPUDeviceContext : public DeviceContext {
  public:
   // Does not take ownership of streams.
-  GPUDeviceContext(int stream_id, gpu::Stream* stream,
-                   gpu::Stream* host_to_device_stream,
-                   gpu::Stream* device_to_host_stream,
-                   gpu::Stream* device_to_device_stream)
+  GPUDeviceContext(int stream_id, stream_executor::Stream* stream,
+                   stream_executor::Stream* host_to_device_stream,
+                   stream_executor::Stream* device_to_host_stream,
+                   stream_executor::Stream* device_to_device_stream)
       : stream_id_(stream_id),
         stream_(stream),
         host_to_device_stream_(host_to_device_stream),
@@ -44,10 +43,14 @@ class GPUDeviceContext : public DeviceContext {
 
   ~GPUDeviceContext() override {}
 
-  gpu::Stream* stream() const override { return stream_; }
-  gpu::Stream* host_to_device_stream() const { return host_to_device_stream_; }
-  gpu::Stream* device_to_host_stream() const { return device_to_host_stream_; }
-  gpu::Stream* device_to_device_stream() const {
+  stream_executor::Stream* stream() const override { return stream_; }
+  stream_executor::Stream* host_to_device_stream() const {
+    return host_to_device_stream_;
+  }
+  stream_executor::Stream* device_to_host_stream() const {
+    return device_to_host_stream_;
+  }
+  stream_executor::Stream* device_to_device_stream() const {
     return device_to_device_stream_;
   }
   int stream_id() const { return stream_id_; }
@@ -67,13 +70,13 @@ class GPUDeviceContext : public DeviceContext {
   int stream_id_;
   // The default primary stream to use for this context.
   // All the memory belongs to this stream.
-  gpu::Stream* stream_;
+  stream_executor::Stream* stream_;
   // The stream to use for copy data from host into GPU.
-  gpu::Stream* host_to_device_stream_;
+  stream_executor::Stream* host_to_device_stream_;
   // The stream to use for copy data from GPU to host.
-  gpu::Stream* device_to_host_stream_;
+  stream_executor::Stream* device_to_host_stream_;
   // The stream to use for copy data between GPU.
-  gpu::Stream* device_to_device_stream_;
+  stream_executor::Stream* device_to_device_stream_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 8473b228d3..223b74857d 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -34,11 +34,9 @@ struct SyclDevice;
 #endif
 }  // end namespace Eigen
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Stream;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
@@ -69,9 +67,10 @@ class PerOpGpuDevice {
 class DeviceContext : public core::RefCounted {
  public:
   ~DeviceContext() override {}
-  virtual perftools::gputools::Stream* stream() const { return nullptr; }
-  virtual void MaintainLifetimeOnStream(
-      const Tensor* t, perftools::gputools::Stream* stream) const {}
+  virtual stream_executor::Stream* stream() const { return nullptr; }
+  virtual void MaintainLifetimeOnStream(const Tensor* t,
+                                        stream_executor::Stream* stream) const {
+  }
 
   // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
   // "device_tensor" which is on a GPU device "device". "device_tensor"
@@ -133,7 +132,7 @@ class DeviceBase {
   // but also by TPU devices (to provide default device context).
   struct GpuDeviceInfo {
     // Make sure all the defaults are NULL, so we can spot missing assignments.
-    perftools::gputools::Stream* stream = nullptr;
+    stream_executor::Stream* stream = nullptr;
     DeviceContext* default_context = nullptr;
     EventMgr* event_mgr = nullptr;
     int gpu_id = -1;
diff --git a/tensorflow/core/platform/default/from_stream_executor_status.h b/tensorflow/core/platform/default/from_stream_executor_status.h
index 2a2297a657..36a67a3648 100644
--- a/tensorflow/core/platform/default/from_stream_executor_status.h
+++ b/tensorflow/core/platform/default/from_stream_executor_status.h
@@ -23,8 +23,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace gpu = ::perftools::gputools;
-
 // On the open-source platform, stream_executor currently uses
 // tensorflow::Status
 inline Status FromStreamExecutorStatus(
diff --git a/tensorflow/core/platform/stream_executor.h b/tensorflow/core/platform/stream_executor.h
index f31e556a70..006184ddef 100644
--- a/tensorflow/core/platform/stream_executor.h
+++ b/tensorflow/core/platform/stream_executor.h
@@ -37,4 +37,15 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(jlebar): Remove this once we've completed
+// the migration.
+using namespace stream_executor;  // NOLINT(build/namespaces)
+
+}  // namespace gputools
+}  // namespace perftools
+
 #endif  // TENSORFLOW_PLATFORM_STREAM_EXECUTOR_H_
diff --git a/tensorflow/core/platform/types.h b/tensorflow/core/platform/types.h
index 6308e58847..f2471712cc 100644
--- a/tensorflow/core/platform/types.h
+++ b/tensorflow/core/platform/types.h
@@ -60,4 +60,12 @@ typedef uint64 Fprint;
 
 }  // namespace tensorflow
 
+// Declares ::stream_executor so it can later be aliased as ::tensorflow::se.
+namespace stream_executor {}
+namespace tensorflow {
+// TODO(b/77980417): Uncomment this once all namespace aliases named 'se' are
+// removed in ::xla.
+// namespace se = ::stream_executor;
+}  // namespace tensorflow
+
 #endif  // TENSORFLOW_PLATFORM_TYPES_H_
diff --git a/tensorflow/stream_executor/blas.cc b/tensorflow/stream_executor/blas.cc
index 31724cf6c9..906d6fb702 100644
--- a/tensorflow/stream_executor/blas.cc
+++ b/tensorflow/stream_executor/blas.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/lib/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace blas {
 
 string TransposeString(Transpose t) {
@@ -95,5 +94,4 @@ std::ostream& operator<<(std::ostream& os, ComputationType ty) {
 }
 
 }  // namespace blas
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h
index c5f778a5c7..6e62b85728 100644
--- a/tensorflow/stream_executor/blas.h
+++ b/tensorflow/stream_executor/blas.h
@@ -49,8 +49,7 @@ namespace Eigen {
 struct half;
 }  // namespace Eigen
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 class ScratchAllocator;
@@ -2100,7 +2099,6 @@ class BlasSupport {
                   DeviceMemory<std::complex<double>> *b, int ldb) override;
 
 }  // namespace blas
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_BLAS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.cc b/tensorflow/stream_executor/cuda/cuda_activation.cc
index 5f4cf9dbd7..cf6b9e2c6e 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.cc
+++ b/tensorflow/stream_executor/cuda/cuda_activation.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec);
@@ -40,5 +39,4 @@ ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.h b/tensorflow/stream_executor/cuda/cuda_activation.h
index c9d43a9766..04ffaef364 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.h
+++ b/tensorflow/stream_executor/cuda/cuda_activation.h
@@ -25,8 +25,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class StreamExecutor;
 
@@ -56,7 +55,6 @@ class ScopedActivateExecutorContext {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 1c550dbb13..007c0f1c86 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -75,15 +75,14 @@ limitations under the License.
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
 
 namespace wrap {
 
-#define PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name)                      \
+#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                         \
   struct WrapperShim__##__name {                                    \
     static const char *kName;                                       \
     template <typename... Args>                                     \
@@ -94,8 +93,8 @@ namespace wrap {
   } __name;                                                         \
   const char *WrapperShim__##__name::kName = #__name;
 
-#define PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(__name) \
-  PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name)
+#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
 
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
   __macro(cublasSnrm2)                    \
@@ -269,28 +268,28 @@ namespace wrap {
   __macro(cublasCdgmm)                    \
   __macro(cublasZdgmm)
 
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasCreate)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasDestroy)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetStream)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetPointerMode)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasGetPointerMode)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmBatched)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasDgemmBatched)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasCgemmBatched)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasZgemmBatched)
-CUBLAS_BLAS_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasCreate)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasDestroy)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasSetStream)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasSetPointerMode)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasGetPointerMode)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmBatched)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasDgemmBatched)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasCgemmBatched)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasZgemmBatched)
+CUBLAS_BLAS_ROUTINE_EACH(STREAM_EXECUTOR_CUBLAS_V2_WRAP)
 
 #if CUDA_VERSION >= 7050
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmEx)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmEx)
 #endif
 
 #if CUDA_VERSION >= 8000
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasGemmEx)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasGemmEx)
 #endif
 
 #if CUDA_VERSION >= 9000
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasGetMathMode)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSetMathMode)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasGetMathMode)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasSetMathMode)
 #endif
 
 }  // namespace wrap
@@ -2803,46 +2802,39 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
 
 }  // namespace cuda
 
-namespace gpu = ::perftools::gputools;
-
 void initialize_cublas() {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::BlasFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuBlasPlugin, "cuBLAS",
-              [](gpu::internal::StreamExecutorInterface
-                     *parent) -> gpu::blas::BlasSupport * {
-                gpu::cuda::CUDAExecutor *cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuBLAS "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                gpu::cuda::CUDABlas *blas =
-                    new gpu::cuda::CUDABlas(cuda_executor);
-                if (!blas->Init()) {
-                  // Note: Init() will log a more specific error.
-                  delete blas;
-                  return nullptr;
-                }
-                return blas;
-              });
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::BlasFactory>(
+          cuda::kCudaPlatformId, cuda::kCuBlasPlugin, "cuBLAS",
+          [](internal::StreamExecutorInterface *parent) -> blas::BlasSupport * {
+            cuda::CUDAExecutor *cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR)
+                  << "Attempting to initialize an instance of the cuBLAS "
+                  << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            cuda::CUDABlas *blas = new cuda::CUDABlas(cuda_executor);
+            if (!blas->Init()) {
+              // Note: Init() will log a more specific error.
+              delete blas;
+              return nullptr;
+            }
+            return blas;
+          });
 
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuBLAS factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kBlas,
-                                                     gpu::cuda::kCuBlasPlugin);
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kBlas, cuda::kCuBlasPlugin);
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(register_cublas,
-                            { perftools::gputools::initialize_cublas(); });
+                            { stream_executor::initialize_cublas(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index deb211c04b..55c414a1f9 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -29,8 +29,7 @@ limitations under the License.
 
 typedef struct cublasContext *cublasHandle_t;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -162,7 +161,6 @@ class CUDABlas : public blas::BlasSupport {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 933c103f52..feb529297e 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -51,8 +51,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/inlined_vector.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 #ifdef __APPLE__
@@ -384,5 +383,4 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
 
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.h b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
index aa68321acc..f2db2eb20a 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.h
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // e.g. DriverVersion{346, 3, 4}
@@ -93,7 +92,6 @@ class Diagnostician {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 4a6b2bf5d7..d673e19007 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -59,8 +59,7 @@ NarrowT CheckedNarrowing(const WideT& wide) {
 
 }  // namespace
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 using dnn::BatchDescriptor;
 using dnn::FilterDescriptor;
@@ -159,7 +158,7 @@ static port::ThreadPool* GetCudaThreadpool() {
   return cudnn_threadpool;
 }
 
-#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name)                      \
+#define STREAM_EXECUTOR_CUDNN_WRAP(__name)                         \
   struct WrapperShim__##__name {                                   \
     template <typename... Args>                                    \
     cudnnStatus_t operator()(CUDAExecutor* parent, Args... args) { \
@@ -169,7 +168,7 @@ static port::ThreadPool* GetCudaThreadpool() {
     }                                                              \
   } __name;
 
-#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM(__name)        \
+#define STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM(__name)           \
   struct WrapperShim__##__name {                                         \
     template <typename... Args>                                          \
     cudnnStatus_t operator()(CudnnSupport* dnn, Stream* s, Args... args) \
@@ -220,7 +219,7 @@ struct WrapperShim__cudnnSetStream {
   __macro(cudnnSetFilterNdDescriptor)
 
 // clang-format on
-CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH
 
 // clang-format off
@@ -242,7 +241,7 @@ CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 
 // clang-format on
 CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(
-    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
 #undef CUDNN_DNN_ROUTINE_EACH_WITH_STREAM
 
 // APIs available after R3:
@@ -252,7 +251,7 @@ CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(
   __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
   __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
   __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
 #endif
 
@@ -266,7 +265,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 // clang-format on
 
 CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(
-    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
 #undef CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM
 #endif
 
@@ -293,7 +292,7 @@ CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(
   __macro(cudnnGetFilterNdDescriptor)
 
 // clang-format on
-CUDNN_DNN_ROUTINE_EACH_R5(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R5(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R5
 
 // clang-format off
@@ -305,7 +304,7 @@ CUDNN_DNN_ROUTINE_EACH_R5(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 
 // clang-format on
 CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(
-    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
 #undef CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM
 #endif
 
@@ -316,7 +315,7 @@ CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(
   __macro(cudnnSetRNNDescriptor_v6)
 
 // clang-format on
-CUDNN_DNN_ROUTINE_EACH_R6(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R6(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R6
 
 // clang-format off
@@ -325,7 +324,7 @@ CUDNN_DNN_ROUTINE_EACH_R6(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 
 // clang-format on
 CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(
-    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
 #undef CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM
 #endif
 
@@ -337,7 +336,7 @@ CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(
   __macro(cudnnSetRNNMatrixMathType)
 
 // clang-format on
-CUDNN_DNN_ROUTINE_EACH_R7(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R7(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R7
 #endif
 
@@ -4727,46 +4726,39 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
 
 }  // namespace cuda
 
-namespace gpu = ::perftools::gputools;
-
 void initialize_cudnn() {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::DnnFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuDnnPlugin, "cuDNN",
-              [](gpu::internal::StreamExecutorInterface*
-                     parent) -> gpu::dnn::DnnSupport* {
-                gpu::cuda::CUDAExecutor* cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor*>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuBLAS "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                gpu::cuda::CudnnSupport* dnn =
-                    new gpu::cuda::CudnnSupport(cuda_executor);
-                if (!dnn->Init().ok()) {
-                  // Note: Init() will log a more specific error.
-                  delete dnn;
-                  return nullptr;
-                }
-                return dnn;
-              });
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::DnnFactory>(
+          cuda::kCudaPlatformId, cuda::kCuDnnPlugin, "cuDNN",
+          [](internal::StreamExecutorInterface* parent) -> dnn::DnnSupport* {
+            cuda::CUDAExecutor* cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor*>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR)
+                  << "Attempting to initialize an instance of the cuDNN "
+                  << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            cuda::CudnnSupport* dnn = new cuda::CudnnSupport(cuda_executor);
+            if (!dnn->Init().ok()) {
+              // Note: Init() will log a more specific error.
+              delete dnn;
+              return nullptr;
+            }
+            return dnn;
+          });
 
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuDNN factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kDnn,
-                                                     gpu::cuda::kCuDnnPlugin);
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kDnn, cuda::kCuDnnPlugin);
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(register_cudnn,
-                            { perftools::gputools::initialize_cudnn(); });
+                            { stream_executor::initialize_cudnn(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 7518b23757..e6d12bfef9 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -26,8 +26,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 class CUDAExecutor;
@@ -810,7 +809,6 @@ class CudnnSupport : public dnn::DnnSupport {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 58e1e58c59..fedf4f53b8 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -53,8 +53,7 @@ bool FLAGS_gpuexec_cuda_device_0_only = false;
 // matches the expected one.
 constexpr bool kVerifyCudaContext = false;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 namespace {
@@ -1649,5 +1648,4 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index fa9172b3f0..a9969e247e 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -27,8 +27,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "cuda/include/cuda.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Identifies the memory space where an allocation resides. See
@@ -506,7 +505,6 @@ class CudaContext {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_event.cc b/tensorflow/stream_executor/cuda/cuda_event.cc
index 1b41502300..96dcf17356 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.cc
+++ b/tensorflow/stream_executor/cuda/cuda_event.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 CUDAEvent::CUDAEvent(CUDAExecutor* parent)
@@ -68,5 +67,4 @@ const CUevent& CUDAEvent::cuda_event() {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_event.h b/tensorflow/stream_executor/cuda/cuda_event.h
index 56667e65d3..f62344672e 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.h
+++ b/tensorflow/stream_executor/cuda/cuda_event.h
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/event.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // CUDAEvent wraps a CUevent in the platform-independent EventInterface
@@ -58,7 +57,6 @@ class CUDAEvent : public internal::EventInterface {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index a922f14fb4..5b34740f9f 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -31,8 +31,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
@@ -44,7 +43,7 @@ namespace wrap {
 // manner on first use. This dynamic loading technique is used to avoid DSO
 // dependencies on vendor libraries which may or may not be available in the
 // deployed binary environment.
-#define PERFTOOLS_GPUTOOLS_CUFFT_WRAP(__name)                    \
+#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                       \
   struct WrapperShim__##__name {                                 \
     template <typename... Args>                                  \
     cufftResult operator()(CUDAExecutor *parent, Args... args) { \
@@ -68,7 +67,7 @@ namespace wrap {
                                               __macro(cufftGetSizeMany)        \
                                                   __macro(cufftMakePlanMany)
 
-CUFFT_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUFFT_WRAP)
+CUFFT_ROUTINE_EACH(STREAM_EXECUTOR_CUFFT_WRAP)
 
 }  // namespace wrap
 
@@ -514,62 +513,59 @@ bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
   return true;
 }
 
-#define PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(__type, __fft_type1, __fft_type2, \
-                                           __fft_type3)                      \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                       \
-                      const DeviceMemory<std::complex<__type>> &input,       \
-                      DeviceMemory<std::complex<__type>> *output) {          \
-    return DoFftWithDirectionInternal(                                       \
-        stream, plan, wrap::cufftExec##__fft_type1, input, output);          \
-  }                                                                          \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                       \
-                      const DeviceMemory<__type> &input,                     \
-                      DeviceMemory<std::complex<__type>> *output) {          \
-    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type2, input,  \
-                         output);                                            \
-  }                                                                          \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                       \
-                      const DeviceMemory<std::complex<__type>> &input,       \
-                      DeviceMemory<__type> *output) {                        \
-    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type3, input,  \
-                         output);                                            \
+#define STREAM_EXECUTOR_CUDA_DEFINE_FFT(__type, __fft_type1, __fft_type2,   \
+                                        __fft_type3)                        \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
+                      const DeviceMemory<std::complex<__type>> &input,      \
+                      DeviceMemory<std::complex<__type>> *output) {         \
+    return DoFftWithDirectionInternal(                                      \
+        stream, plan, wrap::cufftExec##__fft_type1, input, output);         \
+  }                                                                         \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
+                      const DeviceMemory<__type> &input,                    \
+                      DeviceMemory<std::complex<__type>> *output) {         \
+    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type2, input, \
+                         output);                                           \
+  }                                                                         \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
+                      const DeviceMemory<std::complex<__type>> &input,      \
+                      DeviceMemory<__type> *output) {                       \
+    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type3, input, \
+                         output);                                           \
   }
 
-PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(float, C2C, R2C, C2R)
-PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
+STREAM_EXECUTOR_CUDA_DEFINE_FFT(float, C2C, R2C, C2R)
+STREAM_EXECUTOR_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
 
-#undef PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT
+#undef STREAM_EXECUTOR_CUDA_DEFINE_FFT
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
-
-namespace gpu = ::perftools::gputools;
-
-REGISTER_MODULE_INITIALIZER(register_cufft, {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::FftFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuFftPlugin, "cuFFT",
-              [](gpu::internal::StreamExecutorInterface
-                     *parent) -> gpu::fft::FftSupport * {
-                gpu::cuda::CUDAExecutor *cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuFFT "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                return new gpu::cuda::CUDAFft(cuda_executor);
-              });
+
+void initialize_cufft() {
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
+          cuda::kCudaPlatformId, cuda::kCuFftPlugin, "cuFFT",
+          [](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
+            cuda::CUDAExecutor *cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR) << "Attempting to initialize an instance of the cuFFT "
+                         << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            return new cuda::CUDAFft(cuda_executor);
+          });
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuFFT factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kFft,
-                                                     gpu::cuda::kCuFftPlugin);
-});
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kFft, cuda::kCuFftPlugin);
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_cufft,
+                            { stream_executor::initialize_cufft(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h
index 04c7dfe501..8171e61418 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.h
+++ b/tensorflow/stream_executor/cuda/cuda_fft.h
@@ -26,8 +26,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -133,7 +132,6 @@ class CUDAFft : public fft::FftSupport {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 5ecaf46b8c..9700daca89 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -66,8 +66,7 @@ limitations under the License.
 extern bool FLAGS_check_gpu_leaks;
 bool FLAGS_prefer_cubin_to_ptx = true;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
@@ -1168,17 +1167,14 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
 
 }  // namespace cuda
 
-namespace gpu = ::perftools::gputools;
-
 void initialize_cuda_gpu_executor() {
-  *gpu::internal::MakeCUDAExecutorImplementation() = [](
-      const gpu::PluginConfig &config) {
-    return new gpu::cuda::CUDAExecutor{config};
+  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig &config) {
+    return new cuda::CUDAExecutor{config};
   };
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(
-    cuda_gpu_executor, {perftools::gputools::initialize_cuda_gpu_executor();});
+REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {
+  stream_executor::initialize_cuda_gpu_executor();
+});
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index dbbbcd476f..f686685474 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -35,8 +35,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // CUDA-platform implementation of the platform-agnostic
@@ -273,7 +272,6 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_helpers.h b/tensorflow/stream_executor/cuda/cuda_helpers.h
index 6a6134bf88..d55706c66a 100644
--- a/tensorflow/stream_executor/cuda/cuda_helpers.h
+++ b/tensorflow/stream_executor/cuda/cuda_helpers.h
@@ -27,8 +27,7 @@ limitations under the License.
 #include "cuda/include/cuComplex.h"
 #include "cuda/include/cuda.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 template <typename ElemT>
 class DeviceMemory;
@@ -101,7 +100,6 @@ inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h
index 88d29fddd0..beaebe8f12 100644
--- a/tensorflow/stream_executor/cuda/cuda_kernel.h
+++ b/tensorflow/stream_executor/cuda/cuda_kernel.h
@@ -40,8 +40,7 @@ limitations under the License.
     "CUDA runtime being included into CUDA GPU executor; should be driver only."
 #endif
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Wraps a CUfunction to implement the platform-independent KernelInterface.
@@ -124,7 +123,6 @@ inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index 3a73846148..7a6ef5a248 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -24,8 +24,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 namespace {
 
@@ -41,16 +40,16 @@ const DeviceOptions GetDeviceOptionsFromEnv() {
       std::getenv("TF_CUDA_PLATFORM_GPU_DEVICE_SCHEDULE");
 
   if (gpu_schedule_string == nullptr) {
-    return perftools::gputools::DeviceOptions::Default();
+    return DeviceOptions::Default();
   }
 
   unsigned device_flags = 0;
   if (strcmp(kScheduleSpinString, gpu_schedule_string) == 0) {
-    device_flags = perftools::gputools::DeviceOptions::kScheduleSpin;
+    device_flags = DeviceOptions::kScheduleSpin;
   } else if (strcmp(kScheduleYieldString, gpu_schedule_string) == 0) {
-    device_flags = perftools::gputools::DeviceOptions::kScheduleYield;
+    device_flags = DeviceOptions::kScheduleYield;
   } else if (strcmp(kScheduleBlockingSyncString, gpu_schedule_string) == 0) {
-    device_flags = perftools::gputools::DeviceOptions::kScheduleBlockingSync;
+    device_flags = DeviceOptions::kScheduleBlockingSync;
   } else {
     LOG(QFATAL) << "Unknown option for environment variable "
                    "TF_CUDA_PLATFORM_GPU_DEVICE_SCHEDULE "
@@ -59,7 +58,7 @@ const DeviceOptions GetDeviceOptionsFromEnv() {
                 << ", " << kScheduleYieldString << "}";
   }
 
-  return perftools::gputools::DeviceOptions(device_flags);
+  return DeviceOptions(device_flags);
 }
 
 }  // namespace
@@ -202,11 +201,10 @@ static void InitializeCudaPlatform() {
   SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(cuda_platform,
-                            perftools::gputools::InitializeCudaPlatform());
+                            stream_executor::InitializeCudaPlatform());
 
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 // Note that module initialization sequencing is not supported in the
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.h b/tensorflow/stream_executor/cuda/cuda_platform.h
index dab25602d0..fc0e15d5a6 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.h
+++ b/tensorflow/stream_executor/cuda/cuda_platform.h
@@ -31,8 +31,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Opaque and unique identifier for the CUDA platform plugin.
@@ -104,7 +103,6 @@ class CudaPlatform : public Platform {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_platform_id.cc b/tensorflow/stream_executor/cuda/cuda_platform_id.cc
index dfd11a9abe..a7bb304cc8 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform_id.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform_id.cc
@@ -15,12 +15,10 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLATFORM_DEFINE_ID(kCudaPlatformId);
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_platform_id.h b/tensorflow/stream_executor/cuda/cuda_platform_id.h
index c677724517..92bcfd8372 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform_id.h
+++ b/tensorflow/stream_executor/cuda/cuda_platform_id.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Opaque and unique identifier for the cuda platform.
@@ -30,7 +29,6 @@ namespace cuda {
 extern const Platform::Id kCudaPlatformId;
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_ID_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 8641b60227..e289e7ced5 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -54,15 +54,14 @@ std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
   }
 }
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
 
 namespace wrap {
 
-#define PERFTOOLS_GPUTOOLS_CURAND_WRAP(__name)                      \
+#define STREAM_EXECUTOR_CURAND_WRAP(__name)                         \
   struct WrapperShim__##__name {                                    \
     template <typename... Args>                                     \
     curandStatus_t operator()(CUDAExecutor *parent, Args... args) { \
@@ -71,15 +70,15 @@ namespace wrap {
     }                                                               \
   } __name;
 
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandCreateGenerator);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandDestroyGenerator);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetStream);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniform);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniformDouble);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetPseudoRandomGeneratorSeed);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetGeneratorOffset);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormal);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormalDouble);
+STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
+STREAM_EXECUTOR_CURAND_WRAP(curandDestroyGenerator);
+STREAM_EXECUTOR_CURAND_WRAP(curandSetStream);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateUniform);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateUniformDouble);
+STREAM_EXECUTOR_CURAND_WRAP(curandSetPseudoRandomGeneratorSeed);
+STREAM_EXECUTOR_CURAND_WRAP(curandSetGeneratorOffset);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormal);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormalDouble);
 
 }  // namespace wrap
 
@@ -271,42 +270,40 @@ bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
-
-namespace gpu = ::perftools::gputools;
-
-REGISTER_MODULE_INITIALIZER(register_curand, {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::RngFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuRandPlugin, "cuRAND",
-              [](gpu::internal::StreamExecutorInterface
-                     *parent) -> gpu::rng::RngSupport * {
-                gpu::cuda::CUDAExecutor *cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuRAND "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                gpu::cuda::CUDARng *rng = new gpu::cuda::CUDARng(cuda_executor);
-                if (!rng->Init()) {
-                  // Note: Init() will log a more specific error.
-                  delete rng;
-                  return nullptr;
-                }
-                return rng;
-              });
+
+void initialize_curand() {
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
+          cuda::kCudaPlatformId, cuda::kCuRandPlugin, "cuRAND",
+          [](internal::StreamExecutorInterface *parent) -> rng::RngSupport * {
+            cuda::CUDAExecutor *cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR)
+                  << "Attempting to initialize an instance of the cuRAND "
+                  << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            cuda::CUDARng *rng = new cuda::CUDARng(cuda_executor);
+            if (!rng->Init()) {
+              // Note: Init() will log a more specific error.
+              delete rng;
+              return nullptr;
+            }
+            return rng;
+          });
 
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuRAND factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kRng,
-                                                     gpu::cuda::kCuRandPlugin);
-});
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kRng, cuda::kCuRandPlugin);
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_curand,
+                            { stream_executor::initialize_curand(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.h b/tensorflow/stream_executor/cuda/cuda_rng.h
index 5bbfd0b37a..57ef398aaa 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.h
+++ b/tensorflow/stream_executor/cuda/cuda_rng.h
@@ -24,8 +24,7 @@ limitations under the License.
 
 typedef struct curandGenerator_st *curandGenerator_t;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 template <typename ElemT>
@@ -98,7 +97,6 @@ class CUDARng : public rng::RngSupport {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.cc b/tensorflow/stream_executor/cuda/cuda_stream.cc
index 3eb37a7d84..b5aa7694f7 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.cc
+++ b/tensorflow/stream_executor/cuda/cuda_stream.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 bool CUDAStream::Init() {
@@ -59,5 +58,4 @@ CUstream AsCUDAStreamValue(Stream *stream) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.h b/tensorflow/stream_executor/cuda/cuda_stream.h
index 7358243dc4..02edff6431 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.h
+++ b/tensorflow/stream_executor/cuda/cuda_stream.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 class CUDAExecutor;
@@ -89,7 +88,6 @@ CUDAStream *AsCUDAStream(Stream *stream);
 CUstream AsCUDAStreamValue(Stream *stream);
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/cuda/cuda_timer.cc
index 8532f08725..991a12a23d 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.cc
+++ b/tensorflow/stream_executor/cuda/cuda_timer.cc
@@ -20,8 +20,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 bool CUDATimer::Init() {
@@ -92,5 +91,4 @@ bool CUDATimer::Stop(CUDAStream* stream) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h
index 2abc55ec94..70554ec931 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.h
+++ b/tensorflow/stream_executor/cuda/cuda_timer.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 class CUDAExecutor;
@@ -60,13 +59,13 @@ class CUDATimer : public internal::TimerInterface {
   // events.
   float GetElapsedMilliseconds() const;
 
-  // See perftools::gputools::Timer::Microseconds().
+  // See Timer::Microseconds().
   // TODO(leary) make this into an error code interface...
   uint64 Microseconds() const override {
     return GetElapsedMilliseconds() * 1e3;
   }
 
-  // See perftools::GPUTools::Timer::Nanoseconds().
+  // See Timer::Nanoseconds().
   uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
 
  private:
@@ -85,7 +84,6 @@ struct TimerDeleter {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.cc b/tensorflow/stream_executor/cuda/cudnn_version.cc
index 5591801aae..e8fcc03618 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version.cc
@@ -15,8 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cudnn_version.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
@@ -38,5 +37,4 @@ bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.h b/tensorflow/stream_executor/cuda/cudnn_version.h
index 2ed02e1700..6464e7f8e8 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.h
+++ b/tensorflow/stream_executor/cuda/cudnn_version.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 struct CudnnVersion {
@@ -46,7 +45,6 @@ bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
                                         CudnnVersion loaded_version);
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
index 42b3dc8cc6..7d4c6399d0 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version_test.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/test.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 namespace {
 
@@ -70,5 +69,4 @@ TEST(IsSourceCompatibleWithCudnnLibraryTest, Basic) {
 
 }  // namespace
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index 52f5319a3b..8ca0677f8a 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/mathutil.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 static const uint64 kUninitializedUint64 = -1ULL;
 /* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";
@@ -234,6 +233,4 @@ uint64 CalculateRegisterLimitForTargetOccupancy(
   return 0;
 }
 
-
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index fcf0928096..7f99d81ef3 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -27,8 +27,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 class DeviceDescriptionBuilder;
 }  // namespace internal
@@ -388,7 +387,6 @@ uint64 CalculateRegisterLimitForTargetOccupancy(
     const DeviceDescription &device_description, uint64 shared_memory_per_block,
     const ThreadDim &thread_dims, uint64 target_blocks_per_core);
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
diff --git a/tensorflow/stream_executor/device_memory.h b/tensorflow/stream_executor/device_memory.h
index 4c92b7dc78..5a5334e0f5 100644
--- a/tensorflow/stream_executor/device_memory.h
+++ b/tensorflow/stream_executor/device_memory.h
@@ -32,6 +32,16 @@ limitations under the License.
 namespace perftools {
 namespace gputools {
 
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
+}  // namespace gputools
+}  // namespace perftools
+
+namespace stream_executor {
+
 class StreamExecutor;
 
 // void*-analogous device memory allocation. For the typed variation, see
@@ -280,7 +290,6 @@ static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed");
 static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed");
 static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed");
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
diff --git a/tensorflow/stream_executor/device_options.h b/tensorflow/stream_executor/device_options.h
index 169325e7d1..2646950f42 100644
--- a/tensorflow/stream_executor/device_options.h
+++ b/tensorflow/stream_executor/device_options.h
@@ -25,8 +25,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Indicates a set of options for a device's usage, which generally must be
 // provided at StreamExecutor device-initialization time.
@@ -84,7 +83,6 @@ struct DeviceOptions {
   unsigned flags_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_OPTIONS_H_
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 0a3c4bcf50..6edb572820 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -18,8 +18,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace dnn {
 
 bool DnnSupport::GetConvolveAlgorithms(
@@ -554,5 +553,4 @@ string NormalizeDescriptor::ToShortString() const {
 }
 
 }  // namespace dnn
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 3c47d2c2e8..8e202d115a 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -38,8 +38,7 @@ namespace Eigen {
 struct half;
 }  // namespace Eigen
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class HostBuffer;
 class Stream;
@@ -2285,7 +2284,6 @@ class DnnSupport {
 };
 
 }  // namespace dnn
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DNN_H_
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
index 9516883627..114143b3ab 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -37,8 +37,7 @@ limitations under the License.
 #include "cuda/cuda_config.h"
 #endif
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 string GetCudaVersion() { return TF_CUDA_VERSION; }
@@ -291,5 +290,4 @@ static std::vector<string>* CreatePrimordialRpaths() {
 }
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/dso_loader.h b/tensorflow/stream_executor/dso_loader.h
index 354c7b50b8..9ee081cb3d 100644
--- a/tensorflow/stream_executor/dso_loader.h
+++ b/tensorflow/stream_executor/dso_loader.h
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 // Permits StreamExecutor code to dynamically load a pre-determined set of
@@ -114,7 +113,6 @@ class CachedDsoLoader {
 };
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
diff --git a/tensorflow/stream_executor/event.cc b/tensorflow/stream_executor/event.cc
index c423a453e9..50a6edd80b 100644
--- a/tensorflow/stream_executor/event.cc
+++ b/tensorflow/stream_executor/event.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 Event::Event(StreamExecutor* stream_exec)
     : stream_exec_(stream_exec),
@@ -48,5 +47,4 @@ Event::Status Event::PollForStatus() {
   return stream_exec_->PollForEventStatus(this);
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/event.h b/tensorflow/stream_executor/event.h
index a06c26ea51..1f37262c78 100644
--- a/tensorflow/stream_executor/event.h
+++ b/tensorflow/stream_executor/event.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace internal {
 class EventInterface;
@@ -76,7 +75,6 @@ class Event {
   SE_DISALLOW_COPY_AND_ASSIGN(Event);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_EVENT_H_
diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc
index d1a8aae167..0b3ad7ebbc 100644
--- a/tensorflow/stream_executor/executor_cache.cc
+++ b/tensorflow/stream_executor/executor_cache.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 port::StatusOr<StreamExecutor*> ExecutorCache::GetOrCreate(
     const StreamExecutorConfig& config,
@@ -104,5 +103,4 @@ ExecutorCache::Entry::~Entry() {
   configurations.clear();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/executor_cache.h b/tensorflow/stream_executor/executor_cache.h
index 12f2275f6d..bbeeaed787 100644
--- a/tensorflow/stream_executor/executor_cache.h
+++ b/tensorflow/stream_executor/executor_cache.h
@@ -24,8 +24,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Utility class to allow Platform objects to manage cached StreamExecutors.
 // Thread-safe.
@@ -76,7 +75,6 @@ class ExecutorCache {
   SE_DISALLOW_COPY_AND_ASSIGN(ExecutorCache);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_CACHE_H_
diff --git a/tensorflow/stream_executor/fft.h b/tensorflow/stream_executor/fft.h
index 6b1728829a..814efb2e92 100644
--- a/tensorflow/stream_executor/fft.h
+++ b/tensorflow/stream_executor/fft.h
@@ -48,8 +48,7 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 template <typename ElemT>
@@ -210,7 +209,7 @@ class FftSupport {
 
 // Macro used to quickly declare overrides for abstract virtuals in the
 // fft::FftSupport base class. Assumes that it's emitted somewhere inside the
-// ::perftools::gputools namespace.
+// ::stream_executor namespace.
 #define TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES                   \
   std::unique_ptr<fft::Plan> Create1dPlan(Stream *stream, uint64 num_x,        \
                                           fft::Type type, bool in_place_fft)   \
@@ -265,7 +264,6 @@ class FftSupport {
              DeviceMemory<double> *output) override;
 
 }  // namespace fft
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_FFT_H_
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index 542f521ef7..2c4819651a 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -28,8 +28,7 @@ limitations under the License.
 
 bool FLAGS_stream_executor_cpu_real_clock_rate = false;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 HostStream *AsHostStream(Stream *stream) {
@@ -266,5 +265,4 @@ rng::RngSupport *HostExecutor::CreateRng() {
 }
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index e2c0e6d6b7..0c3991c151 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 // An implementation of StreamExecutor that does no communication or interaction
@@ -210,7 +209,6 @@ class HostExecutor : public internal::StreamExecutorInterface {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc
index 2cb7d36967..00a17a05ed 100644
--- a/tensorflow/stream_executor/host/host_platform.cc
+++ b/tensorflow/stream_executor/host/host_platform.cc
@@ -26,10 +26,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status_macros.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace gpu = ::perftools::gputools;
-
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 HostPlatform::HostPlatform() : name_("Host") {}
@@ -93,16 +90,15 @@ void HostPlatform::UnregisterTraceListener(TraceListener* listener) {
 }
 
 static void InitializeHostPlatform() {
-  std::unique_ptr<gpu::Platform> platform(new gpu::host::HostPlatform);
-  SE_CHECK_OK(gpu::MultiPlatformManager::RegisterPlatform(std::move(platform)));
+  std::unique_ptr<Platform> platform(new host::HostPlatform);
+  SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(
-    host_platform, perftools::gputools::host::InitializeHostPlatform());
+REGISTER_MODULE_INITIALIZER(host_platform,
+                            stream_executor::host::InitializeHostPlatform());
 
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 // Note that module initialization sequencing is not supported in the
diff --git a/tensorflow/stream_executor/host/host_platform.h b/tensorflow/stream_executor/host/host_platform.h
index 0faec6c8b7..c6f46a2cc4 100644
--- a/tensorflow/stream_executor/host/host_platform.h
+++ b/tensorflow/stream_executor/host/host_platform.h
@@ -33,8 +33,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 // Host (CPU) platform plugin, registered as a singleton value via module
@@ -79,7 +78,6 @@ class HostPlatform : public Platform {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_H_
diff --git a/tensorflow/stream_executor/host/host_platform_id.cc b/tensorflow/stream_executor/host/host_platform_id.cc
index 69a203f298..2256bccec3 100644
--- a/tensorflow/stream_executor/host/host_platform_id.cc
+++ b/tensorflow/stream_executor/host/host_platform_id.cc
@@ -15,12 +15,10 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/host/host_platform_id.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 PLATFORM_DEFINE_ID(kHostPlatformId);
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_platform_id.h b/tensorflow/stream_executor/host/host_platform_id.h
index 61d84ea2e2..18d1f282f1 100644
--- a/tensorflow/stream_executor/host/host_platform_id.h
+++ b/tensorflow/stream_executor/host/host_platform_id.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 // Opaque and unique identifier for the host platform.
@@ -30,7 +29,6 @@ namespace host {
 extern const Platform::Id kHostPlatformId;
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_ID_H_
diff --git a/tensorflow/stream_executor/host/host_stream.cc b/tensorflow/stream_executor/host/host_stream.cc
index 5961c31516..5a7d3b3dd4 100644
--- a/tensorflow/stream_executor/host/host_stream.cc
+++ b/tensorflow/stream_executor/host/host_stream.cc
@@ -17,8 +17,7 @@ limitations under the License.
 // the HostExecutor implementation.
 #include "tensorflow/stream_executor/host/host_stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 HostStream::HostStream()
@@ -53,5 +52,4 @@ void HostStream::BlockUntilDone() {
 
 }  // namespace host
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_stream.h b/tensorflow/stream_executor/host/host_stream.h
index 9894d17feb..5d7b8a3782 100644
--- a/tensorflow/stream_executor/host/host_stream.h
+++ b/tensorflow/stream_executor/host/host_stream.h
@@ -24,8 +24,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 class HostStream : public internal::StreamInterface {
@@ -52,7 +51,6 @@ class HostStream : public internal::StreamInterface {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_STREAM_H_
diff --git a/tensorflow/stream_executor/host/host_timer.cc b/tensorflow/stream_executor/host/host_timer.cc
index d84d825c92..e138daf0e1 100644
--- a/tensorflow/stream_executor/host/host_timer.cc
+++ b/tensorflow/stream_executor/host/host_timer.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 using std::chrono::duration_cast;
@@ -46,5 +45,4 @@ void HostTimer::StartNow() { start_time_ = clock::now(); }
 void HostTimer::StopNow() { duration_ = clock::now() - start_time_; }
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_timer.h b/tensorflow/stream_executor/host/host_timer.h
index 17af7c0521..5954b8023b 100644
--- a/tensorflow/stream_executor/host/host_timer.h
+++ b/tensorflow/stream_executor/host/host_timer.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 class HostTimer : public internal::TimerInterface {
@@ -57,7 +56,6 @@ class HostTimer : public internal::TimerInterface {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_TIMER_H_
diff --git a/tensorflow/stream_executor/host_buffer.h b/tensorflow/stream_executor/host_buffer.h
index 8fa542e9ff..20299da517 100644
--- a/tensorflow/stream_executor/host_buffer.h
+++ b/tensorflow/stream_executor/host_buffer.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/dnn.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // A HostBuffer is a block of memory in host memory containing the data for a
 // dnn::BatchDescriptor using a device-dependent memory layout.
@@ -42,7 +41,6 @@ class HostBuffer {
   const dnn::BatchDescriptor descriptor_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
diff --git a/tensorflow/stream_executor/kernel.cc b/tensorflow/stream_executor/kernel.cc
index 636199cfa2..d1aa596b73 100644
--- a/tensorflow/stream_executor/kernel.cc
+++ b/tensorflow/stream_executor/kernel.cc
@@ -27,8 +27,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 bool KernelMetadata::registers_per_thread(int *registers_per_thread) const {
   if (has_registers_per_thread_) {
@@ -103,5 +102,4 @@ void KernelBase::set_name(port::StringPiece name) {
   demangled_name_ = port::Demangle(stubless_name.data());
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/kernel.h b/tensorflow/stream_executor/kernel.h
index 5358eac1ae..2216884b87 100644
--- a/tensorflow/stream_executor/kernel.h
+++ b/tensorflow/stream_executor/kernel.h
@@ -64,7 +64,7 @@ limitations under the License.
 //
 // Users typically won't need to type out the TypedKernel signature in full, it
 // will be typedef'd by automatically generated code; for example, see
-// perftools::gputools::executor_sample::VecReduceAddKernel.
+// stream_executor::executor_sample::VecReduceAddKernel.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
@@ -82,8 +82,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class DeviceMemoryBase;
 template <typename ElemT>
@@ -639,8 +638,8 @@ struct KernelInvocationChecker {
   // NOTE: if you encounter an error here, you can see the mismatch by looking
   // at the end of the last error message, which will be of the form:
   //
-  //    ...::Compatible<const perftools::gputools::DeviceMemory<OneThing> &,
-  //                    perftools::gputools::DeviceMemory<AnotherThing>, true,
+  //    ...::Compatible<const stream_executor::DeviceMemory<OneThing> &,
+  //                    stream_executor::DeviceMemory<AnotherThing>, true,
   //                    0>'
   //    requested here
   //
@@ -711,7 +710,6 @@ struct KernelParamsOk<TypedKernel<Params...>, Args...> {
       std::tuple<Params...>, std::tuple<Args...>>::CheckAllNoStaticAssert();
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
diff --git a/tensorflow/stream_executor/kernel_cache_config.h b/tensorflow/stream_executor/kernel_cache_config.h
index 9d7ab1b79f..e63d6c6a0c 100644
--- a/tensorflow/stream_executor/kernel_cache_config.h
+++ b/tensorflow/stream_executor/kernel_cache_config.h
@@ -18,8 +18,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // This enum represents potential configurations of L1/shared memory when
 // running a particular kernel. These values represent user preference, and
@@ -38,7 +37,6 @@ enum class KernelCacheConfig {
   kPreferEqual,
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
diff --git a/tensorflow/stream_executor/kernel_spec.cc b/tensorflow/stream_executor/kernel_spec.cc
index 0404c573f0..6a1f0a591f 100644
--- a/tensorflow/stream_executor/kernel_spec.cc
+++ b/tensorflow/stream_executor/kernel_spec.cc
@@ -15,9 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/kernel_spec.h"
 
-
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 KernelLoaderSpec::KernelLoaderSpec(port::StringPiece kernelname)
     : kernelname_(kernelname.ToString()) {}
@@ -247,5 +245,4 @@ MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
 
 MultiKernelLoaderSpec::MultiKernelLoaderSpec(size_t arity) : arity_(arity) {}
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/kernel_spec.h b/tensorflow/stream_executor/kernel_spec.h
index 3811bd833e..7cc23bb4e6 100644
--- a/tensorflow/stream_executor/kernel_spec.h
+++ b/tensorflow/stream_executor/kernel_spec.h
@@ -56,8 +56,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Describes how to load a kernel on a target platform.
 //
@@ -374,7 +373,6 @@ class MultiKernelLoaderSpec {
   size_t arity_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
diff --git a/tensorflow/stream_executor/launch_dim.h b/tensorflow/stream_executor/launch_dim.h
index b95462667e..68f2f74840 100644
--- a/tensorflow/stream_executor/launch_dim.h
+++ b/tensorflow/stream_executor/launch_dim.h
@@ -21,7 +21,7 @@ limitations under the License.
 // a single PC in a unit called a warp. There is a maximum number of threads
 // that can execute in a shared-context entity called a block. Presently, that
 // number is 1024 -- again, something that should not be relied on from this
-// comment, but checked via perftools::gputools::DeviceDescription.
+// comment, but checked via stream_executor::DeviceDescription.
 //
 // For additional information, see
 // http://docs.nvidia.com/cuda/kepler-tuning-guide/#device-utilization-and-occupancy
@@ -40,8 +40,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Basic type that represents a 3-dimensional index space.
 struct Dim3D {
@@ -74,7 +73,6 @@ struct BlockDim : public Dim3D {
   }
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_
diff --git a/tensorflow/stream_executor/lib/array_slice.h b/tensorflow/stream_executor/lib/array_slice.h
index bef61bb2fc..8e3c4ca047 100644
--- a/tensorflow/stream_executor/lib/array_slice.h
+++ b/tensorflow/stream_executor/lib/array_slice.h
@@ -18,14 +18,23 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::gtl::ArraySlice;
 using tensorflow::gtl::MutableArraySlice;
 
 }  // namespace port
+}  // namespace stream_executor
+
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
 }  // namespace gputools
 }  // namespace perftools
 
diff --git a/tensorflow/stream_executor/lib/casts.h b/tensorflow/stream_executor/lib/casts.h
index 2261944e25..ec562e804f 100644
--- a/tensorflow/stream_executor/lib/casts.h
+++ b/tensorflow/stream_executor/lib/casts.h
@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
 
 #include <stdlib.h>
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // port::bit_cast<Dest,Source> is a template function that implements the
@@ -96,7 +95,6 @@ inline Dest bit_cast(const Source& source) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
diff --git a/tensorflow/stream_executor/lib/demangle.cc b/tensorflow/stream_executor/lib/demangle.cc
index fa2b4fa005..adb6b4f2d1 100644
--- a/tensorflow/stream_executor/lib/demangle.cc
+++ b/tensorflow/stream_executor/lib/demangle.cc
@@ -27,8 +27,7 @@ limitations under the License.
 #include <cxxabi.h>
 #endif
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // The API reference of abi::__cxa_demangle() can be found in
@@ -49,5 +48,4 @@ string Demangle(const char *mangled) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/demangle.h b/tensorflow/stream_executor/lib/demangle.h
index 30be522557..af16fa7d8c 100644
--- a/tensorflow/stream_executor/lib/demangle.h
+++ b/tensorflow/stream_executor/lib/demangle.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 string Demangle(const char* mangled);
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_DEMANGLE_H_
diff --git a/tensorflow/stream_executor/lib/env.h b/tensorflow/stream_executor/lib/env.h
index c9a22ebd55..776eba0408 100644
--- a/tensorflow/stream_executor/lib/env.h
+++ b/tensorflow/stream_executor/lib/env.h
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::Env;
@@ -37,7 +36,6 @@ inline Status FileExists(const port::StringPiece& filename) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_ENV_H_
diff --git a/tensorflow/stream_executor/lib/error.h b/tensorflow/stream_executor/lib/error.h
index 89df70cb5e..c659f5fc14 100644
--- a/tensorflow/stream_executor/lib/error.h
+++ b/tensorflow/stream_executor/lib/error.h
@@ -13,21 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
 
 #include "tensorflow/core/lib/core/error_codes.pb.h"  // IWYU pragma: export
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 namespace error = tensorflow::error;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
diff --git a/tensorflow/stream_executor/lib/human_readable.h b/tensorflow/stream_executor/lib/human_readable.h
index f918c180d9..893865f6da 100644
--- a/tensorflow/stream_executor/lib/human_readable.h
+++ b/tensorflow/stream_executor/lib/human_readable.h
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 class HumanReadableNumBytes {
@@ -67,7 +66,6 @@ class HumanReadableNumBytes {
 };
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_HUMAN_READABLE_H_
diff --git a/tensorflow/stream_executor/lib/initialize.h b/tensorflow/stream_executor/lib/initialize.h
index 9a09318a6c..688b021469 100644
--- a/tensorflow/stream_executor/lib/initialize.h
+++ b/tensorflow/stream_executor/lib/initialize.h
@@ -26,8 +26,7 @@ limitations under the License.
 #undef DECLARE_MODULE_INITIALIZER
 #undef REGISTER_MODULE_INITIALIZER_SEQUENCE
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 class Initializer {
@@ -49,20 +48,18 @@ class Initializer {
 };
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-#define REGISTER_INITIALIZER(type, name, body)                               \
-  static void google_init_##type##_##name() { body; }                        \
-  perftools::gputools::port::Initializer google_initializer_##type##_##name( \
+#define REGISTER_INITIALIZER(type, name, body)                             \
+  static void google_init_##type##_##name() { body; }                      \
+  ::stream_executor::port::Initializer google_initializer_##type##_##name( \
       google_init_##type##_##name)
 
 #define REGISTER_MODULE_INITIALIZER(name, body) \
   REGISTER_INITIALIZER(module, name, body)
 
-#define DECLARE_INITIALIZER(type, name)         \
-  extern perftools::gputools::port::Initializer \
-      google_initializer_##type##_##name
+#define DECLARE_INITIALIZER(type, name) \
+  extern ::stream_executor::port::Initializer google_initializer_##type##_##name
 
 #define DECLARE_MODULE_INITIALIZER(name) DECLARE_INITIALIZER(module, name)
 
diff --git a/tensorflow/stream_executor/lib/inlined_vector.h b/tensorflow/stream_executor/lib/inlined_vector.h
index 55a1e3ad10..40bdddb180 100644
--- a/tensorflow/stream_executor/lib/inlined_vector.h
+++ b/tensorflow/stream_executor/lib/inlined_vector.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::gtl::InlinedVector;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_INLINED_VECTOR_H_
diff --git a/tensorflow/stream_executor/lib/mathutil.h b/tensorflow/stream_executor/lib/mathutil.h
index e8310d55dd..c225dc5f3c 100644
--- a/tensorflow/stream_executor/lib/mathutil.h
+++ b/tensorflow/stream_executor/lib/mathutil.h
@@ -25,8 +25,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 class MathUtil {
@@ -97,7 +96,6 @@ IntegralType MathUtil::CeilOrFloorOfRatio(IntegralType numerator,
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_MATHUTIL_H_
diff --git a/tensorflow/stream_executor/lib/notification.h b/tensorflow/stream_executor/lib/notification.h
index 9bb3e170dc..472d8c9845 100644
--- a/tensorflow/stream_executor/lib/notification.h
+++ b/tensorflow/stream_executor/lib/notification.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/core/platform/notification.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::Notification;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_NOTIFICATION_H_
diff --git a/tensorflow/stream_executor/lib/numbers.cc b/tensorflow/stream_executor/lib/numbers.cc
index 11a65e198d..b670c42ec8 100644
--- a/tensorflow/stream_executor/lib/numbers.cc
+++ b/tensorflow/stream_executor/lib/numbers.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include <stdlib.h>
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 bool safe_strto32(const char* str, int32* value) {
@@ -38,5 +37,4 @@ bool safe_strto32(const string& str, int32* value) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/numbers.h b/tensorflow/stream_executor/lib/numbers.h
index 4a8692b746..2f48281d2d 100644
--- a/tensorflow/stream_executor/lib/numbers.h
+++ b/tensorflow/stream_executor/lib/numbers.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // Convert strings to floating point values.
@@ -28,7 +27,6 @@ namespace port {
 bool safe_strto32(const string& str, int32* value);
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_NUMBERS_H_
diff --git a/tensorflow/stream_executor/lib/path.cc b/tensorflow/stream_executor/lib/path.cc
index f2591f47f7..56e08c316f 100644
--- a/tensorflow/stream_executor/lib/path.cc
+++ b/tensorflow/stream_executor/lib/path.cc
@@ -16,8 +16,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/path.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 namespace internal {
 
@@ -58,5 +57,4 @@ string JoinPathImpl(std::initializer_list<port::StringPiece> paths) {
 
 }  // namespace internal
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/path.h b/tensorflow/stream_executor/lib/path.h
index 93053dbcb6..325f04ff47 100644
--- a/tensorflow/stream_executor/lib/path.h
+++ b/tensorflow/stream_executor/lib/path.h
@@ -20,8 +20,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::io::Dirname;
@@ -56,7 +55,6 @@ inline string JoinPath(const T&... args) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PATH_H_
diff --git a/tensorflow/stream_executor/lib/process_state.cc b/tensorflow/stream_executor/lib/process_state.cc
index 3d856187f0..72d71e6211 100644
--- a/tensorflow/stream_executor/lib/process_state.cc
+++ b/tensorflow/stream_executor/lib/process_state.cc
@@ -25,8 +25,7 @@ limitations under the License.
 #endif
 #include <memory>
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 string Hostname() {
@@ -54,5 +53,4 @@ bool GetCurrentDirectory(string* dir) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/process_state.h b/tensorflow/stream_executor/lib/process_state.h
index 205e726d95..248218c759 100644
--- a/tensorflow/stream_executor/lib/process_state.h
+++ b/tensorflow/stream_executor/lib/process_state.h
@@ -18,15 +18,13 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 string Hostname();
 bool GetCurrentDirectory(string* dir);
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PROCESS_STATE_H_
diff --git a/tensorflow/stream_executor/lib/ptr_util.h b/tensorflow/stream_executor/lib/ptr_util.h
index 3d5e56faf7..3f89794688 100644
--- a/tensorflow/stream_executor/lib/ptr_util.h
+++ b/tensorflow/stream_executor/lib/ptr_util.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include <memory>
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // Trait to select overloads and return types for MakeUnique.
@@ -59,8 +58,17 @@ typename MakeUniqueResult<T>::invalid MakeUnique(Args&&... /* args */) =
     delete;  // NOLINT
 
 }  // namespace port
+}  // namespace stream_executor
+
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(jlebar): Remove this once we've completed
+// the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
 }  // namespace gputools
 }  // namespace perftools
 
-
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_
diff --git a/tensorflow/stream_executor/lib/stacktrace.h b/tensorflow/stream_executor/lib/stacktrace.h
index ba7e5317f0..a15b0f3026 100644
--- a/tensorflow/stream_executor/lib/stacktrace.h
+++ b/tensorflow/stream_executor/lib/stacktrace.h
@@ -19,14 +19,12 @@ limitations under the License.
 #include "tensorflow/core/platform/stacktrace.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::CurrentStackTrace;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STACKTRACE_H_
diff --git a/tensorflow/stream_executor/lib/status.h b/tensorflow/stream_executor/lib/status.h
index 8c289e1927..407b71b405 100644
--- a/tensorflow/stream_executor/lib/status.h
+++ b/tensorflow/stream_executor/lib/status.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_H_
@@ -23,15 +23,14 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using Status = tensorflow::Status;
 
 #define SE_CHECK_OK(val) TF_CHECK_OK(val)
 #define SE_ASSERT_OK(val) \
-  ASSERT_EQ(::perftools::gputools::port::Status::OK(), (val))
+  ASSERT_EQ(::stream_executor::port::Status::OK(), (val))
 
 // Define some canonical error helpers.
 inline Status UnimplementedError(StringPiece message) {
@@ -45,6 +44,16 @@ inline Status FailedPreconditionError(StringPiece message) {
 }
 
 }  // namespace port
+}  // namespace stream_executor
+
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
 }  // namespace gputools
 }  // namespace perftools
 
diff --git a/tensorflow/stream_executor/lib/statusor.h b/tensorflow/stream_executor/lib/statusor.h
index 3b97929b37..dab5909674 100644
--- a/tensorflow/stream_executor/lib/statusor.h
+++ b/tensorflow/stream_executor/lib/statusor.h
@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 
 #include "tensorflow/compiler/xla/statusor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // Use XLA's StatusOr so we don't duplicate code.
@@ -29,7 +28,6 @@ template <typename T>
 using StatusOr = ::xla::StatusOr<T>;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
diff --git a/tensorflow/stream_executor/lib/str_util.h b/tensorflow/stream_executor/lib/str_util.h
index 5dd3d06aff..a81c666818 100644
--- a/tensorflow/stream_executor/lib/str_util.h
+++ b/tensorflow/stream_executor/lib/str_util.h
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::str_util::Join;
@@ -38,7 +37,6 @@ inline string StripSuffixString(port::StringPiece str, port::StringPiece suffix)
 using tensorflow::str_util::Lowercase;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STR_UTIL_H_
diff --git a/tensorflow/stream_executor/lib/strcat.h b/tensorflow/stream_executor/lib/strcat.h
index 424cb75f0e..c959e4df5b 100644
--- a/tensorflow/stream_executor/lib/strcat.h
+++ b/tensorflow/stream_executor/lib/strcat.h
@@ -13,22 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
 
 #include "tensorflow/core/lib/strings/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::strings::StrCat;
 using tensorflow::strings::StrAppend;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
diff --git a/tensorflow/stream_executor/lib/stringpiece.h b/tensorflow/stream_executor/lib/stringpiece.h
index 97ee0c9206..b80de5df30 100644
--- a/tensorflow/stream_executor/lib/stringpiece.h
+++ b/tensorflow/stream_executor/lib/stringpiece.h
@@ -19,14 +19,12 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::StringPiece;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPIECE_H_
diff --git a/tensorflow/stream_executor/lib/stringprintf.h b/tensorflow/stream_executor/lib/stringprintf.h
index 504de25a68..2f65ed9c6a 100644
--- a/tensorflow/stream_executor/lib/stringprintf.h
+++ b/tensorflow/stream_executor/lib/stringprintf.h
@@ -18,15 +18,13 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::strings::Printf;
 using tensorflow::strings::Appendf;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPRINTF_H_
diff --git a/tensorflow/stream_executor/lib/thread_options.h b/tensorflow/stream_executor/lib/thread_options.h
index bd7f63714e..079cf757ac 100644
--- a/tensorflow/stream_executor/lib/thread_options.h
+++ b/tensorflow/stream_executor/lib/thread_options.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/core/platform/env.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::ThreadOptions;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_THREAD_OPTIONS_H_
diff --git a/tensorflow/stream_executor/lib/threadpool.h b/tensorflow/stream_executor/lib/threadpool.h
index 35630c5106..220068ade1 100644
--- a/tensorflow/stream_executor/lib/threadpool.h
+++ b/tensorflow/stream_executor/lib/threadpool.h
@@ -21,14 +21,12 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/notification.h"
 #include "tensorflow/stream_executor/lib/thread_options.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::thread::ThreadPool;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_THREADPOOL_H_
diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc
index f9f3737a06..5b51398d8c 100644
--- a/tensorflow/stream_executor/multi_platform_manager.cc
+++ b/tensorflow/stream_executor/multi_platform_manager.cc
@@ -20,8 +20,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 /* static */ mutex MultiPlatformManager::platforms_mutex_{LINKER_INITIALIZED};
 
@@ -132,8 +131,7 @@ MultiPlatformManager::InitializePlatformWithId(
   GetPlatformByIdMap()->clear();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(
     multi_platform_manager,
diff --git a/tensorflow/stream_executor/multi_platform_manager.h b/tensorflow/stream_executor/multi_platform_manager.h
index 438653ee20..672855d5fb 100644
--- a/tensorflow/stream_executor/multi_platform_manager.h
+++ b/tensorflow/stream_executor/multi_platform_manager.h
@@ -22,8 +22,8 @@ limitations under the License.
 // In your BUILD rule, add a dependency on a platform plugin that you'd like
 // to use, such as:
 //
-//   //perftools/gputools/executor/cuda:cuda_platform
-//   //perftools/gputools/executor/opencl:opencl_platform
+//   //third_party/tensorflow/stream_executor/cuda:cuda_platform
+//   //third_party/tensorflow/stream_executor/opencl:opencl_platform
 //
 // This will register platform plugins that can be discovered via this
 // interface. Sample API usage:
@@ -56,10 +56,10 @@ limitations under the License.
 // And similarly, for standard interfaces (BLAS, RNG, etc.) you can add
 // dependencies on support libraries, e.g.:
 //
-//    //perftools/gputools/executor/cuda:pluton_blas_plugin
-//    //perftools/gputools/executor/cuda:cudnn_plugin
-//    //perftools/gputools/executor/cuda:cublas_plugin
-//    //perftools/gputools/executor/cuda:curand_plugin
+//    //third_party/tensorflow/stream_executor/cuda:pluton_blas_plugin
+//    //third_party/tensorflow/stream_executor/cuda:cudnn_plugin
+//    //third_party/tensorflow/stream_executor/cuda:cublas_plugin
+//    //third_party/tensorflow/stream_executor/cuda:curand_plugin
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
@@ -75,8 +75,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Manages multiple platforms that may be present on the current machine.
 class MultiPlatformManager {
@@ -181,7 +180,6 @@ class MultiPlatformManager {
   SE_DISALLOW_COPY_AND_ASSIGN(MultiPlatformManager);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
diff --git a/tensorflow/stream_executor/platform.cc b/tensorflow/stream_executor/platform.cc
index 4cdc22bd16..777abced86 100644
--- a/tensorflow/stream_executor/platform.cc
+++ b/tensorflow/stream_executor/platform.cc
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 string PlatformKindString(PlatformKind kind) {
   switch (kind) {
@@ -135,5 +134,4 @@ port::Status Platform::EnablePeerAccess() {
   return port::Status::OK();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/platform.h b/tensorflow/stream_executor/platform.h
index 54f8aa86c2..5cb7047b6f 100644
--- a/tensorflow/stream_executor/platform.h
+++ b/tensorflow/stream_executor/platform.h
@@ -29,8 +29,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class StreamExecutor;
 
@@ -106,7 +105,7 @@ class Platform {
   namespace {                           \
   int plugin_id_value;                  \
   }                                     \
-  const perftools::gputools::Platform::Id ID_VAR_NAME = &plugin_id_value;
+  const ::stream_executor::Platform::Id ID_VAR_NAME = &plugin_id_value;
 
   // Returns a key uniquely identifying this platform.
   virtual Id id() const = 0;
@@ -205,7 +204,6 @@ class Platform {
   SE_DISALLOW_COPY_AND_ASSIGN(Platform);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_H_
diff --git a/tensorflow/stream_executor/platform/default/mutex.h b/tensorflow/stream_executor/platform/default/mutex.h
index 62de0cbce0..c9f5a7c609 100644
--- a/tensorflow/stream_executor/platform/default/mutex.h
+++ b/tensorflow/stream_executor/platform/default/mutex.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/mutex.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 #undef mutex_lock
 #undef tf_shared_lock
@@ -35,7 +34,6 @@ using tensorflow::tf_shared_lock;
 #define tf_shared_lock(x) \
   static_assert(0, "tf_shared_lock_decl_missing_var_name");
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
diff --git a/tensorflow/stream_executor/platform/port.h b/tensorflow/stream_executor/platform/port.h
index 6603df4878..259cf380d6 100644
--- a/tensorflow/stream_executor/platform/port.h
+++ b/tensorflow/stream_executor/platform/port.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PORT_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PORT_H_
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 using tensorflow::int8;
 using tensorflow::int16;
@@ -50,8 +49,7 @@ using tensorflow::LINKER_INITIALIZED;
 
 #define SE_FALLTHROUGH_INTENDED TF_FALLTHROUGH_INTENDED
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #define SE_DISALLOW_COPY_AND_ASSIGN TF_DISALLOW_COPY_AND_ASSIGN
 #define SE_MUST_USE_RESULT TF_MUST_USE_RESULT
diff --git a/tensorflow/stream_executor/plugin.cc b/tensorflow/stream_executor/plugin.cc
index 6424658e22..cfbc52ff17 100644
--- a/tensorflow/stream_executor/plugin.cc
+++ b/tensorflow/stream_executor/plugin.cc
@@ -15,8 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/plugin.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Mostly-arbitrary ID only used as a sentinel "not otherwise initialized"
 // value. This value should never [need to] be specified aside by initialization
@@ -51,5 +50,4 @@ PluginConfig& PluginConfig::SetRng(PluginId rng) {
   return *this;
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/plugin.h b/tensorflow/stream_executor/plugin.h
index 0b88b86e2b..0505412e7a 100644
--- a/tensorflow/stream_executor/plugin.h
+++ b/tensorflow/stream_executor/plugin.h
@@ -16,8 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // A plugin ID is a unique identifier for each registered plugin type.
 typedef void* PluginId;
@@ -83,7 +82,6 @@ class PluginConfig {
   PluginId blas_, dnn_, fft_, rng_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_
diff --git a/tensorflow/stream_executor/plugin_registry.cc b/tensorflow/stream_executor/plugin_registry.cc
index 54761139ea..7812703efd 100644
--- a/tensorflow/stream_executor/plugin_registry.cc
+++ b/tensorflow/stream_executor/plugin_registry.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 const PluginId kNullPlugin = nullptr;
 
@@ -244,5 +243,4 @@ EMIT_PLUGIN_SPECIALIZATIONS(DnnFactory, dnn, "DNN");
 EMIT_PLUGIN_SPECIALIZATIONS(FftFactory, fft, "FFT");
 EMIT_PLUGIN_SPECIALIZATIONS(RngFactory, rng, "RNG");
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/plugin_registry.h b/tensorflow/stream_executor/plugin_registry.h
index 8636a49ce6..49628ecd24 100644
--- a/tensorflow/stream_executor/plugin_registry.h
+++ b/tensorflow/stream_executor/plugin_registry.h
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin.h"
 #include "tensorflow/stream_executor/rng.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace internal {
 class StreamExecutorInterface;
@@ -160,7 +159,6 @@ class PluginRegistry {
   SE_DISALLOW_COPY_AND_ASSIGN(PluginRegistry);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLUGIN_REGISTRY_H_
diff --git a/tensorflow/stream_executor/rng.cc b/tensorflow/stream_executor/rng.cc
index 1c05005067..b0efad9108 100644
--- a/tensorflow/stream_executor/rng.cc
+++ b/tensorflow/stream_executor/rng.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/logging.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace rng {
 
 bool RngSupport::CheckSeed(const uint8 *seed, uint64 seed_bytes) {
@@ -47,5 +46,4 @@ const int RngSupport::kMaxSeedBytes;
 #endif
 
 }  // namespace rng
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rng.h b/tensorflow/stream_executor/rng.h
index 36d0fdd454..acbf8fce4c 100644
--- a/tensorflow/stream_executor/rng.h
+++ b/tensorflow/stream_executor/rng.h
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 template <typename ElemT>
@@ -89,7 +88,6 @@ class RngSupport {
 };
 
 }  // namespace rng
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_RNG_H_
diff --git a/tensorflow/stream_executor/scratch_allocator.cc b/tensorflow/stream_executor/scratch_allocator.cc
index 0c1db414f2..8fc4c4c509 100644
--- a/tensorflow/stream_executor/scratch_allocator.cc
+++ b/tensorflow/stream_executor/scratch_allocator.cc
@@ -18,8 +18,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status_macros.h"
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 ScratchAllocator::~ScratchAllocator() {}
 
@@ -38,5 +37,4 @@ port::StatusOr<DeviceMemory<uint8>> OneTimeScratchAllocator::AllocateBytes(
   return temporary_->device_memory();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/scratch_allocator.h b/tensorflow/stream_executor/scratch_allocator.h
index 94d5ede161..2aed2c4437 100644
--- a/tensorflow/stream_executor/scratch_allocator.h
+++ b/tensorflow/stream_executor/scratch_allocator.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -77,7 +76,6 @@ class OneTimeScratchAllocator : public ScratchAllocator {
   SE_DISALLOW_COPY_AND_ASSIGN(OneTimeScratchAllocator);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_SCRATCH_ALLOCATOR_H_
diff --git a/tensorflow/stream_executor/shared_memory_config.h b/tensorflow/stream_executor/shared_memory_config.h
index de556cb734..7cbeb3bcd9 100644
--- a/tensorflow/stream_executor/shared_memory_config.h
+++ b/tensorflow/stream_executor/shared_memory_config.h
@@ -19,8 +19,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
 #define TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // SharedMemoryConfig enum describes potential widths of shared memory banks for
 // a device or kernel.
@@ -30,7 +29,6 @@ enum class SharedMemoryConfig {
   kEightByte,  // Sets shared memory banks to be eight bytes wide.
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index fe498507a8..f59d9a13ac 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace {
 // Code to turn parameters to functions on stream into strings that
@@ -5192,5 +5191,4 @@ port::Status Stream::BlockHostUntilDone() {
   return first_error;
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 4af426001f..d4a81440e9 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -38,8 +38,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/temporary_memory_manager.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace host {
 class HostBlas;
@@ -2098,7 +2097,6 @@ struct Quantization<int32> {
       dnn::QuantizedActivationMode::k32Bit;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_H_
diff --git a/tensorflow/stream_executor/stream_executor.h b/tensorflow/stream_executor/stream_executor.h
index 2995dccf46..d63d485df5 100644
--- a/tensorflow/stream_executor/stream_executor.h
+++ b/tensorflow/stream_executor/stream_executor.h
@@ -35,4 +35,15 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"  // IWYU pragma: export
 #include "tensorflow/stream_executor/timer.h"  // IWYU pragma: export
 
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
+}  // namespace gputools
+}  // namespace perftools
+
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 273d970b6f..8297228e6f 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -15,8 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 // -- CUDA
@@ -38,5 +37,4 @@ StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
 StreamExecutorFactory MakeHostExecutorImplementation;
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 37ef182e14..2584c92f0c 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -45,8 +45,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/trace_listener.h"
 #include "tensorflow/stream_executor/lib/inlined_vector.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 class Timer;
@@ -343,7 +342,6 @@ extern StreamExecutorFactory MakeHostExecutorImplementation;
 
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index f55fa68402..2e1adeb31e 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -39,8 +39,7 @@ namespace {
 bool FLAGS_check_device_leaks = false;
 }  // namespace
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace {
 
 string StackTraceIfVLOG10() {
@@ -788,5 +787,4 @@ internal::StreamExecutorInterface *StreamExecutor::implementation() {
   return implementation_->GetUnderlyingExecutor();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 69d0374d73..39af7115d8 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -37,8 +37,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Structure used for device memory leak checking.
 struct AllocRecord {
@@ -95,7 +94,7 @@ class StreamExecutor {
   // Parameters:
   //   spec: The MultiKernelLoaderSpec is usually generated as a compile-time
   //    constant into an appropriate namespace. For example, see
-  //    perftools::gputools::executor_sample::kKernelLoaderSpecs, from which a
+  //    stream_executor::executor_sample::kKernelLoaderSpecs, from which a
   //    MultiKernelLoaderSpec is selected.
   //   kernel: Outparam that the kernel is loaded into. A given Kernel
   //    instantiation should not be loaded into more than once.
@@ -803,7 +802,6 @@ inline Stream &Stream::ThenLaunch(ThreadDim thread_dims, BlockDim block_dims,
   return *this;
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_PIMPL_H_
diff --git a/tensorflow/stream_executor/temporary_device_memory.cc b/tensorflow/stream_executor/temporary_device_memory.cc
index c33166b224..f113ce9be5 100644
--- a/tensorflow/stream_executor/temporary_device_memory.cc
+++ b/tensorflow/stream_executor/temporary_device_memory.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 TemporaryDeviceMemoryBase::~TemporaryDeviceMemoryBase() {
   parent_->temporary_memory_manager()->MarkFinalized(device_memory_,
@@ -64,5 +63,4 @@ TemporaryDeviceMemoryBase::TemporaryDeviceMemoryBase(
   DCHECK(IsAllocated());
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/temporary_device_memory.h b/tensorflow/stream_executor/temporary_device_memory.h
index 2255e7ffd7..77be8599a2 100644
--- a/tensorflow/stream_executor/temporary_device_memory.h
+++ b/tensorflow/stream_executor/temporary_device_memory.h
@@ -43,8 +43,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 namespace internal {
@@ -132,7 +131,6 @@ class TemporaryDeviceMemory : public TemporaryDeviceMemoryBase {
   }
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_DEVICE_MEMORY_H_
diff --git a/tensorflow/stream_executor/temporary_memory_manager.cc b/tensorflow/stream_executor/temporary_memory_manager.cc
index 449ab7d3f0..420dbb0933 100644
--- a/tensorflow/stream_executor/temporary_memory_manager.cc
+++ b/tensorflow/stream_executor/temporary_memory_manager.cc
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 void TemporaryMemoryManager::ForceDeallocateAll() {
@@ -124,5 +123,4 @@ TemporaryMemoryManager::AllocateArrayBase(uint64 element_count,
 }
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/temporary_memory_manager.h b/tensorflow/stream_executor/temporary_memory_manager.h
index 2e6fbd9d62..faf13380dc 100644
--- a/tensorflow/stream_executor/temporary_memory_manager.h
+++ b/tensorflow/stream_executor/temporary_memory_manager.h
@@ -31,8 +31,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 // Record used inside the TemporaryMemoryManager as metadata for a given device
@@ -147,7 +146,6 @@ TemporaryMemoryManager::AllocateArray(uint64 element_count) {
 }
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_MEMORY_MANAGER_H_
diff --git a/tensorflow/stream_executor/timer.cc b/tensorflow/stream_executor/timer.cc
index 41d7e4359d..a29791a104 100644
--- a/tensorflow/stream_executor/timer.cc
+++ b/tensorflow/stream_executor/timer.cc
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 Timer::Timer(StreamExecutor *parent)
     : parent_(parent),
@@ -34,5 +33,4 @@ uint64 Timer::Microseconds() const { return implementation_->Microseconds(); }
 
 uint64 Timer::Nanoseconds() const { return implementation_->Nanoseconds(); }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/timer.h b/tensorflow/stream_executor/timer.h
index 0a37caa0f2..fba7dd8f58 100644
--- a/tensorflow/stream_executor/timer.h
+++ b/tensorflow/stream_executor/timer.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace internal {
 class TimerInterface;
@@ -69,7 +68,6 @@ class Timer {
   SE_DISALLOW_COPY_AND_ASSIGN(Timer);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TIMER_H_
diff --git a/tensorflow/stream_executor/trace_listener.h b/tensorflow/stream_executor/trace_listener.h
index d1e87c348b..0e874a1d47 100644
--- a/tensorflow/stream_executor/trace_listener.h
+++ b/tensorflow/stream_executor/trace_listener.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -69,7 +68,6 @@ class TraceListener {
                                           const port::Status* result) {}
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TRACE_LISTENER_H_
-- 
GitLab


From 495d511bf384e296d7149537bc0900c32e0b76b5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 14:33:53 -0700
Subject: [PATCH 043/450] Use easy_install to update pip only on Ubuntu14.04

* We only depend on easy_install/easy_install3 to update pip on Ubuntu 14.04
* They are not always available on later systems, e.g. Debian 9
* We can use pip/pip3 to update themselves

PiperOrigin-RevId: 193257326
---
 .../tools/ci_build/install/install_pip_packages.sh  | 13 +++++++++----
 .../ci_build/install/install_pip_packages_remote.sh |  6 ++----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index d406b83a62..fc137aeeed 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -16,10 +16,15 @@
 
 set -e
 
-# We don't apt-get install so that we can install a newer version of pip. Not
-# needed after we upgrade to Ubuntu 16.04
-easy_install -U pip
-easy_install3 -U pip
+# We don't apt-get install pip so that we can install a newer version of it.
+# easy_install is only needed on Ubuntu 14.04; elsewhere pip upgrades itself.
+if grep -qF '14.04' /etc/*-release; then
+  easy_install -U pip
+  easy_install3 -U pip
+else
+  pip2 install --upgrade pip
+  pip3 install --upgrade pip
+fi
 
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh b/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh
index 39a6d557d1..0beabcf5ef 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh
@@ -20,10 +20,8 @@ if [ ! -f /usr/bin/x86_64-linux-gnu-gcc ]; then
   ln -s /usr/local/bin/clang /usr/bin/x86_64-linux-gnu-gcc
 fi
 
-pip2 install -U pip
-pip3 install -U pip
-pip2  install -U setuptools
-pip3 install -U setuptools
+pip2 install --upgrade setuptools
+pip3 install --upgrade setuptools
 
 # The rest of the pip packages will be installed in
 # `install_pip_packages.sh`
-- 
GitLab


From 8a2eb27d7bbb552e2375c4fafa1863e017c503be Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Tue, 17 Apr 2018 14:47:44 -0700
Subject: [PATCH 044/450] Don't consider control flow edges when computing
 switch depth of switch.

PiperOrigin-RevId: 193259710
---
 tensorflow/compiler/tf2xla/functionalize_control_flow.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 16b9142cbf..23629d85ae 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -870,6 +870,9 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
       // Merge the inputs of the switch node with one another. This results in
       // predicates and control input residing in the same cluster.
       for (const Edge* e : n->in_edges()) {
+        // Only consider the data inputs to the Switch node.
+        if (e->IsControlEdge()) continue;
+
         Node* src = e->src();
         UnionFind<Cluster>* src_cluster = find_output_cluster(src);
         int src_cluster_depth = switch_depth[src_cluster->Get().representative];
-- 
GitLab


From 8670a5e23717a8740d1360d34147f90fdf0b3b68 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Tue, 17 Apr 2018 15:01:22 -0700
Subject: [PATCH 045/450] Internal Change.

PiperOrigin-RevId: 193262066
---
 tensorflow/tools/ci_build/windows/bazel/common_env.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 7d4cc7ac30..0e6c0227b7 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -44,6 +44,8 @@ export PYTHON_LIB_PATH="C:/${PYTHON_BASE_PATH}/lib/site-packages"
 # Add python into PATH, it's needed because gen_git_source.py uses
 # '/usr/bin/env python' as a shebang
 export PATH="/c/${PYTHON_BASE_PATH}:$PATH"
+# Add git into PATH needed for gen_git_source.py
+export PATH="/c/Program Files/Git/cmd:$PATH"
 
 # Make sure we have pip in PATH
 export PATH="/c/${PYTHON_BASE_PATH}/Scripts:$PATH"
-- 
GitLab


From ba1ea3ff90ee44c8e82a1fb9ba757d798b55d144 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Mon, 16 Apr 2018 11:24:43 -0700
Subject: [PATCH 046/450] Porting tests for the `decode_proto` and
 `encode_proto` to OS.

PiperOrigin-RevId: 193070420
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/cmake/tf_python.cmake      |   6 +-
 tensorflow/contrib/proto/BUILD                |  16 +
 .../contrib/proto/python/kernel_tests/BUILD   |  86 +++++
 .../proto/python/kernel_tests/build_defs.bzl  |  89 ++++++
 .../kernel_tests/decode_proto_fail_test.py    |  68 ++++
 .../kernel_tests/decode_proto_op_test.py      | 300 ++++++++++++++++++
 .../kernel_tests/encode_proto_op_test.py      | 180 +++++++++++
 .../python/kernel_tests/minmax.TestCase.pbtxt | 161 ++++++++++
 .../python/kernel_tests/nested.TestCase.pbtxt |  16 +
 .../kernel_tests/optional.TestCase.pbtxt      |  20 ++
 .../promote_unsigned.TestCase.pbtxt           |  21 ++
 .../python/kernel_tests/ragged.TestCase.pbtxt |  32 ++
 .../kernel_tests/shaped_batch.TestCase.pbtxt  |  62 ++++
 .../python/kernel_tests/simple.TestCase.pbtxt |  21 ++
 .../proto/python/kernel_tests/test_case.py    |  35 ++
 .../python/kernel_tests/test_example.proto    | 149 +++++++++
 tensorflow/tools/pip_package/BUILD            |   1 +
 19 files changed, 1263 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/BUILD
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/test_case.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/test_example.proto

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 9bef0d8b61..ae68f4aec4 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -77,6 +77,7 @@ py_library(
         "//tensorflow/contrib/optimizer_v2:optimizer_v2_py",
         "//tensorflow/contrib/periodic_resample:init_py",
         "//tensorflow/contrib/predictor",
+        "//tensorflow/contrib/proto",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
         "//tensorflow/contrib/autograph",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index aaddb06fa0..e27ece8fa5 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -64,6 +64,7 @@ from tensorflow.contrib import nn
 from tensorflow.contrib import opt
 from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
+from tensorflow.contrib import proto
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
 from tensorflow.contrib import recurrent
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index ded15b4b66..21f59d2563 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -330,8 +330,10 @@ GENERATE_PYTHON_OP_LIB("ctc_ops")
 GENERATE_PYTHON_OP_LIB("cudnn_rnn_ops")
 GENERATE_PYTHON_OP_LIB("data_flow_ops")
 GENERATE_PYTHON_OP_LIB("dataset_ops")
-GENERATE_PYTHON_OP_LIB("decode_proto_ops")
-GENERATE_PYTHON_OP_LIB("encode_proto_ops")
+GENERATE_PYTHON_OP_LIB("decode_proto_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_decode_proto_op.py)
+GENERATE_PYTHON_OP_LIB("encode_proto_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_encode_proto_op.py)
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD
index 046652cbc5..3e9b1a0b8d 100644
--- a/tensorflow/contrib/proto/BUILD
+++ b/tensorflow/contrib/proto/BUILD
@@ -4,6 +4,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
 py_library(
     name = "proto",
     srcs = [
@@ -14,3 +16,17 @@ py_library(
         "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
     ],
 )
+
+py_library(
+    name = "proto_pip",
+    data = [
+        "//tensorflow/contrib/proto/python/kernel_tests:test_messages",
+    ] + if_static(
+        [],
+        otherwise = ["//tensorflow/contrib/proto/python/kernel_tests:libtestexample.so"],
+    ),
+    deps = [
+        ":proto",
+        "//tensorflow/contrib/proto/python/kernel_tests:py_test_deps",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/BUILD b/tensorflow/contrib/proto/python/kernel_tests/BUILD
new file mode 100644
index 0000000000..a380a131f8
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/BUILD
@@ -0,0 +1,86 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Much of the work in this BUILD file actually happens in the corresponding
+# build_defs.bzl, which creates an individual testcase for each example .pbtxt
+# file in this directory.
+#
+load(":build_defs.bzl", "decode_proto_test_suite")
+load(":build_defs.bzl", "encode_proto_test_suite")
+
+# This expands to a tf_py_test for each test file.
+# It defines the test_suite :decode_proto_tests.
+decode_proto_test_suite(
+    name = "decode_proto_tests",
+    examples = glob(["*.pbtxt"]),
+)
+
+# This expands to a tf_py_test for each test file.
+# It defines the test_suite :encode_proto_tests.
+encode_proto_test_suite(
+    name = "encode_proto_tests",
+    examples = glob(["*.pbtxt"]),
+)
+
+# Below here are tests that are not tied to an example text proto.
+filegroup(
+    name = "test_messages",
+    srcs = glob(["*.pbtxt"]),
+)
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+tf_py_test(
+    name = "decode_proto_fail_test",
+    size = "small",
+    srcs = ["decode_proto_fail_test.py"],
+    additional_deps = [
+        ":py_test_deps",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/proto:proto",
+        "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+    ],
+    data = if_static(
+        [],
+        otherwise = [":libtestexample.so"],
+    ),
+    tags = [
+        "no_pip",  # TODO(b/78026780)
+        "no_windows",  # TODO(b/78028010)
+    ],
+)
+
+py_library(
+    name = "test_case",
+    srcs = ["test_case.py"],
+    deps = ["//tensorflow/python:client_testlib"],
+)
+
+py_library(
+    name = "py_test_deps",
+    deps = [
+        ":test_case",
+        ":test_example_proto_py",
+    ],
+)
+
+tf_proto_library(
+    name = "test_example_proto",
+    srcs = ["test_example.proto"],
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/core:protos_all"],
+)
+
+tf_cc_shared_object(
+    name = "libtestexample.so",
+    linkstatic = 1,
+    deps = [
+        ":test_example_proto_cc",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl b/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
new file mode 100644
index 0000000000..f425601691
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
@@ -0,0 +1,89 @@
+"""BUILD rules for generating file-driven proto test cases.
+
+The decode_proto_test_suite() and encode_proto_test_suite() rules take a list
+of text protos and generate a tf_py_test() for each one.
+"""
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "register_extension_info")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+def _test_name(test, path):
+  return "%s_%s_test" % (test, path.split("/")[-1].split(".")[0])
+
+def decode_proto_test_suite(name, examples):
+  """Build the decode_proto py_test for each test filename."""
+  for test_filename in examples:
+    tf_py_test(
+        name = _test_name("decode_proto", test_filename),
+        srcs = ["decode_proto_op_test.py"],
+        size = "small",
+        data = [test_filename] + if_static(
+            [],
+            otherwise = [":libtestexample.so"],
+        ),
+        main = "decode_proto_op_test.py",
+        args = [
+            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
+        ],
+        additional_deps = [
+            ":py_test_deps",
+            "//third_party/py/numpy",
+            "//tensorflow/contrib/proto:proto",
+            "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+        ],
+        tags = [
+            "no_pip",  # TODO(b/78026780)
+            "no_windows",  # TODO(b/78028010)
+        ],
+    )
+  native.test_suite(
+      name = name,
+      tests = [":" + _test_name("decode_proto", test_filename)
+               for test_filename in examples],
+  )
+
+def encode_proto_test_suite(name, examples):
+  """Build the encode_proto py_test for each test filename."""
+  for test_filename in examples:
+    tf_py_test(
+        name = _test_name("encode_proto", test_filename),
+        srcs = ["encode_proto_op_test.py"],
+        size = "small",
+        data = [test_filename] + if_static(
+            [],
+            otherwise = [":libtestexample.so"],
+        ),
+        main = "encode_proto_op_test.py",
+        args = [
+            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
+        ],
+        additional_deps = [
+            ":py_test_deps",
+            "//third_party/py/numpy",
+            "//tensorflow/contrib/proto:proto",
+            "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+            "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
+        ],
+        tags = [
+            "no_pip",  # TODO(b/78026780)
+            "no_windows",  # TODO(b/78028010)
+        ],
+    )
+  native.test_suite(
+      name = name,
+      tests = [":" + _test_name("encode_proto", test_filename)
+               for test_filename in examples],
+  )
+
+register_extension_info(
+    extension_name = "decode_proto_test_suite",
+    label_regex_map = {
+        "deps": "deps:decode_example_.*",
+    })
+
+register_extension_info(
+    extension_name = "encode_proto_test_suite",
+    label_regex_map = {
+        "deps": "deps:encode_example_.*",
+    })
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
new file mode 100644
index 0000000000..5298342ee7
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
@@ -0,0 +1,68 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+class DecodeProtoFailTest(test_case.ProtoOpTestCase):
+  """Test failure cases for DecodeToProto."""
+
+  def _TestCorruptProtobuf(self, sanitize):
+    """Test failure cases for DecodeToProto."""
+
+    # The goal here is to check the error reporting.
+    # Testing against a variety of corrupt protobufs is
+    # done by fuzzing.
+    corrupt_proto = 'This is not a binary protobuf'
+
+    # Numpy silently truncates the strings if you don't specify dtype=object.
+    batch = np.array(corrupt_proto, dtype=object)
+    msg_type = 'tensorflow.contrib.proto.TestCase'
+    field_names = ['sizes']
+    field_types = [dtypes.int32]
+
+    with self.test_session() as sess:
+      ctensor, vtensor = decode_proto_op.decode_proto(
+          batch,
+          message_type=msg_type,
+          field_names=field_names,
+          output_types=field_types,
+          sanitize=sanitize)
+      with self.assertRaisesRegexp(errors.DataLossError,
+                                   'Unable to parse binary protobuf'
+                                   '|Failed to consume entire buffer'):
+        _ = sess.run([ctensor] + vtensor)
+
+  def testCorrupt(self):
+    self._TestCorruptProtobuf(sanitize=False)
+
+  def testSanitizerCorrupt(self):
+    self._TestCorruptProtobuf(sanitize=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
new file mode 100644
index 0000000000..d1c13c82bc
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
@@ -0,0 +1,300 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Table-driven test for decode_proto op.
+
+This test is run once with each of the *.TestCase.pbtxt files
+in the test directory.
+"""
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('message_text_file', None,
+                    'A file containing a text serialized TestCase protobuf.')
+
+
+class DecodeProtoOpTest(test_case.ProtoOpTestCase):
+
+  def _compareValues(self, fd, vs, evs):
+    """Compare lists/arrays of field values."""
+
+    if len(vs) != len(evs):
+      self.fail('Field %s decoded %d outputs, expected %d' %
+                (fd.name, len(vs), len(evs)))
+    for i, ev in enumerate(evs):
+      # Special case fuzzy match for float32. TensorFlow seems to mess with
+      # MAX_FLT slightly and the test doesn't work otherwise.
+      # TODO(nix): ask on TF list about why MAX_FLT doesn't pass through.
+      if fd.cpp_type == fd.CPPTYPE_FLOAT:
+        # Numpy isclose() is better than assertIsClose() which uses an absolute
+        # value comparison.
+        self.assertTrue(
+            np.isclose(vs[i], ev), 'expected %r, actual %r' % (ev, vs[i]))
+      elif fd.cpp_type == fd.CPPTYPE_STRING:
+        # In Python3 string tensor values will be represented as bytes, so we
+        # reencode the proto values to match that.
+        self.assertEqual(vs[i], ev.encode('ascii'))
+      else:
+        # Doubles and other types pass through unscathed.
+        self.assertEqual(vs[i], ev)
+
+  def _compareRepeatedPrimitiveValue(self, batch_shape, sizes, fields,
+                                     field_dict):
+    """Compare protos of type RepeatedPrimitiveValue.
+
+    Args:
+      batch_shape: the shape of the input tensor of serialized messages.
+      sizes: int matrix of repeat counts returned by decode_proto
+      fields: list of test_example_pb2.FieldSpec (types and expected values)
+      field_dict: map from field names to decoded numpy tensors of values
+    """
+
+    # Check that expected values match.
+    for field in fields:
+      values = field_dict[field.name]
+      self.assertEqual(dtypes.as_dtype(values.dtype), field.dtype)
+
+      fd = field.expected.DESCRIPTOR.fields_by_name[field.name]
+
+      # Values has the same shape as the input plus an extra
+      # dimension for repeats.
+      self.assertEqual(list(values.shape)[:-1], batch_shape)
+
+      # Nested messages are represented as TF strings, requiring
+      # some special handling.
+      if field.name == 'message_value':
+        vs = []
+        for buf in values.flat:
+          msg = test_example_pb2.PrimitiveValue()
+          msg.ParseFromString(buf)
+          vs.append(msg)
+        evs = getattr(field.expected, field.name)
+        if len(vs) != len(evs):
+          self.fail('Field %s decoded %d outputs, expected %d' %
+                    (fd.name, len(vs), len(evs)))
+        for v, ev in zip(vs, evs):
+          self.assertEqual(v, ev)
+        continue
+
+      # This can be a little confusing. For testing we are using
+      # RepeatedPrimitiveValue in two ways: it's the proto that we
+      # decode for testing, and it's used in the expected value as a
+      # union type. The two cases are slightly different: this is the
+      # second case.
+      # We may be fetching the uint64_value from the test proto, but
+      # in the expected proto we store it in the int64_value field
+      # because TensorFlow doesn't support unsigned int64.
+      tf_type_to_primitive_value_field = {
+          dtypes.float32:
+              'float_value',
+          dtypes.float64:
+              'double_value',
+          dtypes.int32:
+              'int32_value',
+          dtypes.uint8:
+              'uint8_value',
+          dtypes.int8:
+              'int8_value',
+          dtypes.string:
+              'string_value',
+          dtypes.int64:
+              'int64_value',
+          dtypes.bool:
+              'bool_value',
+          # Unhandled TensorFlow types:
+          # DT_INT16 DT_COMPLEX64 DT_QINT8 DT_QUINT8 DT_QINT32
+          # DT_BFLOAT16 DT_QINT16 DT_QUINT16 DT_UINT16
+      }
+      tf_field_name = tf_type_to_primitive_value_field.get(field.dtype)
+      if tf_field_name is None:
+        self.fail('Unhandled tensorflow type %d' % field.dtype)
+
+      self._compareValues(fd, values.flat,
+                          getattr(field.expected, tf_field_name))
+
+  def _runDecodeProtoTests(self, fields, case_sizes, batch_shape, batch,
+                           message_type, message_format, sanitize,
+                           force_disordered=False):
+    """Run decode tests on a batch of messages.
+
+    Args:
+      fields: list of test_example_pb2.FieldSpec (types and expected values)
+      case_sizes: expected sizes array
+      batch_shape: the shape of the input tensor of serialized messages
+      batch: list of serialized messages
+      message_type: descriptor name for messages
+      message_format: format of messages, 'text' or 'binary'
+      sanitize: whether to sanitize binary protobuf inputs
+      force_disordered: whether to force fields encoded out of order.
+    """
+
+    if force_disordered:
+      # Exercise code path that handles out-of-order fields by prepending extra
+      # fields with tag numbers higher than any real field. Note that this won't
+      # work with sanitization because that forces reserialization using a
+      # trusted decoder and encoder.
+      assert not sanitize
+      extra_fields = test_example_pb2.ExtraFields()
+      extra_fields.string_value = 'IGNORE ME'
+      extra_fields.bool_value = False
+      extra_msg = extra_fields.SerializeToString()
+      batch = [extra_msg + msg for msg in batch]
+
+    # Numpy silently truncates the strings if you don't specify dtype=object.
+    batch = np.array(batch, dtype=object)
+    batch = np.reshape(batch, batch_shape)
+
+    field_names = [f.name for f in fields]
+    output_types = [f.dtype for f in fields]
+
+    with self.test_session() as sess:
+      sizes, vtensor = decode_proto_op.decode_proto(
+          batch,
+          message_type=message_type,
+          field_names=field_names,
+          output_types=output_types,
+          message_format=message_format,
+          sanitize=sanitize)
+
+      vlist = sess.run([sizes] + vtensor)
+      sizes = vlist[0]
+      # Values is a list of tensors, one for each field.
+      value_tensors = vlist[1:]
+
+      # Check that the repeat sizes are correct.
+      self.assertTrue(
+          np.all(np.array(sizes.shape) == batch_shape + [len(field_names)]))
+
+      # Check that the decoded sizes match the expected sizes.
+      self.assertEqual(len(sizes.flat), len(case_sizes))
+      self.assertTrue(
+          np.all(sizes.flat == np.array(
+              case_sizes, dtype=np.int32)))
+
+      field_dict = dict(zip(field_names, value_tensors))
+
+      self._compareRepeatedPrimitiveValue(batch_shape, sizes, fields,
+                                          field_dict)
+
+  def testBinary(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=False)
+
+  def testBinaryDisordered(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=False,
+        force_disordered=True)
+
+  def testPacked(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Now try with the packed serialization.
+    # We test the packed representations by loading the same test cases
+    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
+    # To do this we rely on the text format being the same for packed and
+    # unpacked fields, and reparse the test message using the packed version
+    # of the proto.
+    packed_batch = [
+        # Note: float_format='.17g' is necessary to ensure preservation of
+        # doubles and floats in text format.
+        text_format.Parse(
+            text_format.MessageToString(
+                primitive, float_format='.17g'),
+            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
+        for primitive in case.primitive
+    ]
+
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        packed_batch,
+        'tensorflow.contrib.proto.PackedPrimitiveValue',
+        'binary',
+        sanitize=False)
+
+  def testText(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Note: float_format='.17g' is necessary to ensure preservation of
+    # doubles and floats in text format.
+    text_batch = [
+        text_format.MessageToString(
+            primitive, float_format='.17g') for primitive in case.primitive
+    ]
+
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        text_batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'text',
+        sanitize=False)
+
+  def testSanitizerGood(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
new file mode 100644
index 0000000000..30e58e6336
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
@@ -0,0 +1,180 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Table-driven test for encode_proto op.
+
+This test is run once with each of the *.TestCase.pbtxt files
+in the test directory.
+
+It tests that encode_proto is a lossless inverse of decode_proto
+(for the specified fields).
+"""
+# Python3 readiness boilerplate
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.contrib.proto.python.ops import encode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('message_text_file', None,
+                    'A file containing a text serialized TestCase protobuf.')
+
+
+class EncodeProtoOpTest(test_case.ProtoOpTestCase):
+
+  def testBadInputs(self):
+    # Invalid field name
+    with self.test_session():
+      with self.assertRaisesOpError('Unknown field: non_existent_field'):
+        encode_proto_op.encode_proto(
+            sizes=[[1]],
+            values=[np.array([[0.0]], dtype=np.int32)],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['non_existent_field']).eval()
+
+    # Incorrect types.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          'Incompatible type for field double_value.'):
+        encode_proto_op.encode_proto(
+            sizes=[[1]],
+            values=[np.array([[0.0]], dtype=np.int32)],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value']).eval()
+
+    # Incorrect shapes of sizes.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          r'sizes should be batch_size \+ \[len\(field_names\)\]'):
+        sizes = array_ops.placeholder(dtypes.int32)
+        values = array_ops.placeholder(dtypes.float64)
+        encode_proto_op.encode_proto(
+            sizes=sizes,
+            values=[values],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value']).eval(feed_dict={
+                sizes: [[[0, 0]]],
+                values: [[0.0]]
+            })
+
+    # Inconsistent shapes of values.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          'Values must match up to the last dimension'):
+        sizes = array_ops.placeholder(dtypes.int32)
+        values1 = array_ops.placeholder(dtypes.float64)
+        values2 = array_ops.placeholder(dtypes.int32)
+        (encode_proto_op.encode_proto(
+            sizes=[[1, 1]],
+            values=[values1, values2],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value', 'int32_value']).eval(feed_dict={
+                values1: [[0.0]],
+                values2: [[0], [0]]
+            }))
+
+  def _testRoundtrip(self, in_bufs, message_type, fields):
+
+    field_names = [f.name for f in fields]
+    out_types = [f.dtype for f in fields]
+
+    with self.test_session() as sess:
+      sizes, field_tensors = decode_proto_op.decode_proto(
+          in_bufs,
+          message_type=message_type,
+          field_names=field_names,
+          output_types=out_types)
+
+      out_tensors = encode_proto_op.encode_proto(
+          sizes,
+          field_tensors,
+          message_type=message_type,
+          field_names=field_names)
+
+      out_bufs, = sess.run([out_tensors])
+
+      # Check that the re-encoded tensor has the same shape.
+      self.assertEqual(in_bufs.shape, out_bufs.shape)
+
+      # Compare the input and output.
+      for in_buf, out_buf in zip(in_bufs.flat, out_bufs.flat):
+        in_obj = test_example_pb2.RepeatedPrimitiveValue()
+        in_obj.ParseFromString(in_buf)
+
+        out_obj = test_example_pb2.RepeatedPrimitiveValue()
+        out_obj.ParseFromString(out_buf)
+
+        # Check that the deserialized objects are identical.
+        self.assertEqual(in_obj, out_obj)
+
+        # Check that the input and output serialized messages are identical.
+        # If we fail here, there is a difference in the serialized
+        # representation but the new serialization still parses. This could
+        # be harmless (a change in map ordering?) or it could be bad (e.g.
+        # loss of packing in the encoding).
+        self.assertEqual(in_buf, out_buf)
+
+  def testRoundtrip(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    in_bufs = [primitive.SerializeToString() for primitive in case.primitive]
+
+    # np.array silently truncates strings if you don't specify dtype=object.
+    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
+    return self._testRoundtrip(
+        in_bufs, 'tensorflow.contrib.proto.RepeatedPrimitiveValue', case.field)
+
+  def testRoundtripPacked(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Now try with the packed serialization.
+    # We test the packed representations by loading the same test cases
+    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
+    # To do this we rely on the text format being the same for packed and
+    # unpacked fields, and reparse the test message using the packed version
+    # of the proto.
+    in_bufs = [
+        # Note: float_format='.17g' is necessary to ensure preservation of
+        # doubles and floats in text format.
+        text_format.Parse(
+            text_format.MessageToString(
+                primitive, float_format='.17g'),
+            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
+        for primitive in case.primitive
+    ]
+
+    # np.array silently truncates strings if you don't specify dtype=object.
+    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
+    return self._testRoundtrip(
+        in_bufs, 'tensorflow.contrib.proto.PackedPrimitiveValue', case.field)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
new file mode 100644
index 0000000000..b170f89c0f
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
@@ -0,0 +1,161 @@
+primitive {
+  double_value: -1.7976931348623158e+308
+  double_value: 2.2250738585072014e-308
+  double_value: 1.7976931348623158e+308
+  float_value: -3.402823466e+38
+  float_value: 1.175494351e-38
+  float_value: 3.402823466e+38
+  int64_value: -9223372036854775808
+  int64_value: 9223372036854775807
+  uint64_value: 0
+  uint64_value: 18446744073709551615
+  int32_value: -2147483648
+  int32_value: 2147483647
+  fixed64_value: 0
+  fixed64_value: 18446744073709551615
+  fixed32_value: 0
+  fixed32_value: 4294967295
+  bool_value: false
+  bool_value: true
+  string_value: ""
+  string_value: "I refer to the infinite."
+  uint32_value: 0
+  uint32_value: 4294967295
+  sfixed32_value: -2147483648
+  sfixed32_value: 2147483647
+  sfixed64_value: -9223372036854775808
+  sfixed64_value: 9223372036854775807
+  sint32_value: -2147483648
+  sint32_value: 2147483647
+  sint64_value: -9223372036854775808
+  sint64_value: 9223372036854775807
+}
+shape: 1
+sizes: 3
+sizes: 3
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: -1.7976931348623158e+308
+    double_value: 2.2250738585072014e-308
+    double_value: 1.7976931348623158e+308
+  }
+}
+field {
+  name: "float_value"
+  dtype: DT_FLOAT
+  expected {
+    float_value: -3.402823466e+38
+    float_value: 1.175494351e-38
+    float_value: 3.402823466e+38
+  }
+}
+field {
+  name: "int64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
+field {
+  name: "uint64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 0
+    int64_value: -1  # unsigned is 18446744073709551615
+  }
+}
+field {
+  name: "int32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "fixed64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 0
+    int64_value: -1  # unsigned is 18446744073709551615
+  }
+}
+field {
+  name: "fixed32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: 0
+    int32_value: -1  # unsigned is 4294967295
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: false
+    bool_value: true
+  }
+}
+field {
+  name: "string_value"
+  dtype: DT_STRING
+  expected {
+    string_value: ""
+    string_value: "I refer to the infinite."
+  }
+}
+field {
+  name: "uint32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: 0
+    int32_value: -1  # unsigned is 4294967295
+  }
+}
+field {
+  name: "sfixed32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "sfixed64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
+field {
+  name: "sint32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "sint64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
new file mode 100644
index 0000000000..c664e52851
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
@@ -0,0 +1,16 @@
+primitive {
+  message_value {
+    double_value: 23.5
+  }
+}
+shape: 1
+sizes: 1
+field {
+  name: "message_value"
+  dtype: DT_STRING
+  expected {
+    message_value {
+      double_value: 23.5
+    }
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
new file mode 100644
index 0000000000..125651d7ea
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
@@ -0,0 +1,20 @@
+primitive {
+  bool_value: true
+}
+shape: 1
+sizes: 1
+sizes: 0
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+  }
+}
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 0.0
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
new file mode 100644
index 0000000000..db7555bf2d
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
@@ -0,0 +1,21 @@
+primitive {
+  fixed32_value: 4294967295
+  uint32_value: 4294967295
+}
+shape: 1
+sizes: 1
+sizes: 1
+field {
+  name: "fixed32_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295
+  }
+}
+field {
+  name: "uint32_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
new file mode 100644
index 0000000000..61c7ac53f7
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
@@ -0,0 +1,32 @@
+primitive {
+  double_value: 23.5
+  double_value: 123.0
+  bool_value: true
+}
+primitive {
+  double_value: 3.1
+  bool_value: false
+}
+shape: 2
+sizes: 2
+sizes: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+    double_value: 123.0
+    double_value: 3.1
+    double_value: 0.0
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+    bool_value: false
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
new file mode 100644
index 0000000000..f4828076d5
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
@@ -0,0 +1,62 @@
+primitive {
+  double_value: 23.5
+  bool_value: true
+}
+primitive {
+  double_value: 44.0
+  bool_value: false
+}
+primitive {
+  double_value: 3.14159
+  bool_value: true
+}
+primitive {
+  double_value: 1.414
+  bool_value: true
+}
+primitive {
+  double_value: -32.2
+  bool_value: false
+}
+primitive {
+  double_value: 0.0001
+  bool_value: true
+}
+shape: 3
+shape: 2
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+    double_value: 44.0
+    double_value: 3.14159
+    double_value: 1.414
+    double_value: -32.2
+    double_value: 0.0001
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+    bool_value: false
+    bool_value: true
+    bool_value: true
+    bool_value: false
+    bool_value: true
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
new file mode 100644
index 0000000000..dc20ac147b
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
@@ -0,0 +1,21 @@
+primitive {
+  double_value: 23.5
+  bool_value: true
+}
+shape: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_case.py b/tensorflow/contrib/proto/python/kernel_tests/test_case.py
new file mode 100644
index 0000000000..b95202c5df
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_case.py
@@ -0,0 +1,35 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Test case base for testing proto operations."""
+
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ctypes as ct
+import os
+
+from tensorflow.python.platform import test
+
+
+class ProtoOpTestCase(test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(ProtoOpTestCase, self).__init__(methodName)
+    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
+    if os.path.isfile(lib):
+      ct.cdll.LoadLibrary(lib)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
new file mode 100644
index 0000000000..dc495034ff
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
@@ -0,0 +1,149 @@
+// Test description and protos to work with it.
+//
+// Many of the protos in this file are for unit tests that haven't been written yet.
+
+syntax = "proto2";
+
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow.contrib.proto;
+
+// A TestCase holds a proto and a bunch of assertions
+// about how it should decode.
+message TestCase {
+  // A batch of primitives to be serialized and decoded.
+  repeated RepeatedPrimitiveValue primitive = 1;
+  // The shape of the batch.
+  repeated int32 shape = 2;
+  // Expected sizes for each field.
+  repeated int32 sizes = 3;
+  // Expected values for each field.
+  repeated FieldSpec field = 4;
+};
+
+// FieldSpec describes the expected output for a single field.
+message FieldSpec {
+  optional string name = 1;
+  optional tensorflow.DataType dtype = 2;
+  optional RepeatedPrimitiveValue expected = 3;
+};
+
+message TestValue {
+  optional PrimitiveValue primitive_value = 1;
+  optional EnumValue enum_value = 2;
+  optional MessageValue message_value = 3;
+  optional RepeatedMessageValue repeated_message_value = 4;
+  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
+}
+
+message PrimitiveValue {
+  optional double double_value = 1;
+  optional float float_value = 2;
+  optional int64 int64_value = 3;
+  optional uint64 uint64_value = 4;
+  optional int32 int32_value = 5;
+  optional fixed64 fixed64_value = 6;
+  optional fixed32 fixed32_value = 7;
+  optional bool bool_value = 8;
+  optional string string_value = 9;
+  optional bytes bytes_value = 12;
+  optional uint32 uint32_value = 13;
+  optional sfixed32 sfixed32_value = 15;
+  optional sfixed64 sfixed64_value = 16;
+  optional sint32 sint32_value = 17;
+  optional sint64 sint64_value = 18;
+}
+
+// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
+message RepeatedPrimitiveValue {
+  repeated double double_value = 1;
+  repeated float float_value = 2;
+  repeated int64 int64_value = 3;
+  repeated uint64 uint64_value = 4;
+  repeated int32 int32_value = 5;
+  repeated fixed64 fixed64_value = 6;
+  repeated fixed32 fixed32_value = 7;
+  repeated bool bool_value = 8;
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13;
+  repeated sfixed32 sfixed32_value = 15;
+  repeated sfixed64 sfixed64_value = 16;
+  repeated sint32 sint32_value = 17;
+  repeated sint64 sint64_value = 18;
+  repeated PrimitiveValue message_value = 19;
+}
+
+// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
+// in the text format, but the binary serialization is different.
+// We test the packed representations by loading the same test cases
+// using this definition instead of RepeatedPrimitiveValue.
+// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
+// in every way except the packed=true declaration.
+message PackedPrimitiveValue {
+  repeated double double_value = 1 [packed = true];
+  repeated float float_value = 2 [packed = true];
+  repeated int64 int64_value = 3 [packed = true];
+  repeated uint64 uint64_value = 4 [packed = true];
+  repeated int32 int32_value = 5 [packed = true];
+  repeated fixed64 fixed64_value = 6 [packed = true];
+  repeated fixed32 fixed32_value = 7 [packed = true];
+  repeated bool bool_value = 8 [packed = true];
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13 [packed = true];
+  repeated sfixed32 sfixed32_value = 15 [packed = true];
+  repeated sfixed64 sfixed64_value = 16 [packed = true];
+  repeated sint32 sint32_value = 17 [packed = true];
+  repeated sint64 sint64_value = 18 [packed = true];
+  repeated PrimitiveValue message_value = 19;
+}
+
+message EnumValue {
+  enum Color {
+    RED = 0;
+    ORANGE = 1;
+    YELLOW = 2;
+    GREEN = 3;
+    BLUE = 4;
+    INDIGO = 5;
+    VIOLET = 6;
+  };
+  optional Color enum_value = 14;
+  repeated Color repeated_enum_value = 15;
+}
+
+
+message InnerMessageValue {
+  optional float float_value = 2;
+  repeated bytes bytes_values = 8;
+}
+
+message MiddleMessageValue {
+  repeated int32 int32_values = 5;
+  optional InnerMessageValue message_value = 11;
+  optional uint32 uint32_value = 13;
+}
+
+message MessageValue {
+  optional double double_value = 1;
+  optional MiddleMessageValue message_value = 11;
+}
+
+message RepeatedMessageValue {
+  message NestedMessageValue {
+    optional float float_value = 2;
+    repeated bytes bytes_values = 8;
+  }
+
+  repeated NestedMessageValue message_values = 11;
+}
+
+// Message containing fields with field numbers higher than any field above. An
+// instance of this message is prepended to each binary message in the test to
+// exercise the code path that handles fields encoded out of order of field
+// number.
+message ExtraFields {
+  optional string string_value = 1776;
+  optional bool bool_value = 1777;
+}
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 376644718f..a0bae23a7c 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -74,6 +74,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
     "//tensorflow/contrib/nn:nn_py",
     "//tensorflow/contrib/predictor:predictor_pip",
+    "//tensorflow/contrib/proto:proto_pip",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
-- 
GitLab


From d995be2debded727f2b99bb87c0d209604a5bb4b Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Mon, 16 Apr 2018 14:47:31 -0700
Subject: [PATCH 047/450] Porting tests for `rpc_op` to OS.

PiperOrigin-RevId: 193102564
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/cmake/tf_python.cmake      |   3 +-
 tensorflow/contrib/rpc/BUILD                  |  16 +
 .../contrib/rpc/python/kernel_tests/BUILD     |  80 +++++
 .../rpc/python/kernel_tests/rpc_op_test.py    |  71 ++++
 .../python/kernel_tests/rpc_op_test_base.py   | 336 ++++++++++++++++++
 .../kernel_tests/rpc_op_test_servicer.py      | 101 ++++++
 .../python/kernel_tests/test_example.proto    | 171 +++++++++
 .../core/platform/default/build_config.bzl    |  86 ++++-
 tensorflow/tools/pip_package/BUILD            |   1 +
 tensorflow/workspace.bzl                      |   4 +
 12 files changed, 867 insertions(+), 4 deletions(-)
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/BUILD
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/test_example.proto

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index ae68f4aec4..7e47516550 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -87,6 +87,7 @@ py_library(
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
         "//tensorflow/contrib/resampler:resampler_py",
         "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/contrib/rpc",
         "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/contrib/seq2seq:seq2seq_py",
         "//tensorflow/contrib/signal:signal_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index e27ece8fa5..36cc5144d0 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -71,6 +71,7 @@ from tensorflow.contrib import recurrent
 from tensorflow.contrib import reduce_slice_ops
 from tensorflow.contrib import resampler
 from tensorflow.contrib import rnn
+from tensorflow.contrib import rpc
 from tensorflow.contrib import saved_model
 from tensorflow.contrib import seq2seq
 from tensorflow.contrib import signal
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 21f59d2563..f6aaf41f73 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -347,7 +347,8 @@ GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/remote_fused_graph/pylib/python/ops/gen_remote_fused_graph_ops.py)
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
-GENERATE_PYTHON_OP_LIB("rpc_ops")
+GENERATE_PYTHON_OP_LIB("rpc_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rpc/python/ops/gen_rpc_op.py)
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
diff --git a/tensorflow/contrib/rpc/BUILD b/tensorflow/contrib/rpc/BUILD
index 597f18c771..dbd311a276 100644
--- a/tensorflow/contrib/rpc/BUILD
+++ b/tensorflow/contrib/rpc/BUILD
@@ -4,6 +4,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
 py_library(
     name = "rpc",
     srcs = [
@@ -11,3 +13,17 @@ py_library(
     ],
     deps = ["//tensorflow/contrib/rpc/python/ops:rpc_op_py"],
 )
+
+py_library(
+    name = "rpc_pip",
+    data = if_static(
+        [],
+        otherwise = ["//tensorflow/contrib/rpc/python/kernel_tests:libtestexample.so"],
+    ),
+    deps = [
+        ":rpc",
+        "//tensorflow/contrib/rpc/python/kernel_tests:py_test_deps",
+        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_base",
+        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_servicer",
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
new file mode 100644
index 0000000000..2311c15a68
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
@@ -0,0 +1,80 @@
+# TODO(b/76425722): Port everything in here to OS (currently excluded).
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+# Placeholder for loading internal BUILD rule.
+
+tf_proto_library(
+    name = "test_example_proto",
+    srcs = ["test_example.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/core:protos_all"],
+)
+
+py_library(
+    name = "py_test_deps",
+    deps = [":test_example_proto_py"],
+)
+
+py_library(
+    name = "rpc_op_test_base",
+    srcs = ["rpc_op_test_base.py"],
+    deps = [
+        ":test_example_proto_py",
+        "//tensorflow/contrib/proto",
+        "//tensorflow/contrib/rpc",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "rpc_op_test_servicer",
+    srcs = ["rpc_op_test_servicer.py"],
+    deps = [
+        ":py_test_deps",
+        ":rpc_op_test_base",
+        "//tensorflow/core:protos_all_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_cc_shared_object(
+    name = "libtestexample.so",
+    linkstatic = 1,
+    deps = [
+        ":test_example_proto_cc",
+    ],
+)
+
+tf_py_test(
+    name = "rpc_op_test",
+    size = "small",
+    srcs = ["rpc_op_test.py"],
+    additional_deps = [
+        ":py_test_deps",
+        ":rpc_op_test_base",
+        ":rpc_op_test_servicer",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+    ],
+    data = if_static(
+        [],
+        otherwise = [":libtestexample.so"],
+    ),
+    tags = [
+        "no_pip",  # TODO(b/78026780)
+        "no_windows",  # TODO(b/78028010)
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
new file mode 100644
index 0000000000..e2e0dbc7a2
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
@@ -0,0 +1,71 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for RpcOp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ctypes as ct
+import os
+
+import grpc
+from grpc.framework.foundation import logging_pool
+import portpicker
+
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_servicer
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
+from tensorflow.python.platform import test
+
+
+class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase):
+  _protocol = 'grpc'
+
+  invalid_method_string = 'Method not found'
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(RpcOpTest, self).__init__(methodName)
+    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
+    if os.path.isfile(lib):
+      ct.cdll.LoadLibrary(lib)
+
+  def get_method_name(self, suffix):
+    return '/tensorflow.contrib.rpc.TestCaseService/%s' % suffix
+
+  def setUp(self):
+    super(RpcOpTest, self).setUp()
+
+    service_port = portpicker.pick_unused_port()
+
+    server = grpc.server(logging_pool.pool(max_workers=25))
+    servicer = rpc_op_test_servicer.RpcOpTestServicer()
+    test_example_pb2_grpc.add_TestCaseServiceServicer_to_server(
+        servicer, server)
+    self._address = 'localhost:%d' % service_port
+    server.add_insecure_port(self._address)
+    server.start()
+    self._server = server
+
+  def tearDown(self):
+    # NOTE: the gRPC server started in setUp() is not stopped here.
+    # TODO(ebrevdo): Figure out why shutting down sometimes times out, then
+    # re-enable: self._service.ExitLoop(); self._service_thread.join();
+    # self._server.stop()
+    super(RpcOpTest, self).tearDown()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
new file mode 100644
index 0000000000..89f3ee1a1c
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
@@ -0,0 +1,336 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Base class for RpcOp tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.contrib.proto.python.ops import encode_proto_op
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.rpc.python.ops import rpc_op
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+
+__all__ = ['I_WARNED_YOU', 'RpcOpTestBase']
+
+I_WARNED_YOU = 'I warned you!'
+
+
+class RpcOpTestBase(object):
+  # pylint: disable=missing-docstring,invalid-name
+  """Base class for RpcOp tests."""
+
+  def get_method_name(self, suffix):
+    raise NotImplementedError
+
+  def rpc(self, *args, **kwargs):
+    return rpc_op.rpc(*args, protocol=self._protocol, **kwargs)
+
+  def try_rpc(self, *args, **kwargs):
+    return rpc_op.try_rpc(*args, protocol=self._protocol, **kwargs)
+
+  def testScalarHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = (
+          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(response_tensors.shape, ())
+      response_values = sess.run(response_tensors)
+    response_message = test_example_pb2.TestCase()
+    self.assertTrue(response_message.ParseFromString(response_values))
+    self.assertAllEqual([2, 3, 4], response_message.shape)
+
+  def testScalarHostPortTryRpc(self):
+    with self.test_session() as sess:
+      request_tensors = (
+          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(status_code.shape, ())
+      self.assertEqual(status_message.shape, ())
+      self.assertEqual(response_tensors.shape, ())
+      response_values, status_code_values, status_message_values = (
+          sess.run((response_tensors, status_code, status_message)))
+    response_message = test_example_pb2.TestCase()
+    self.assertTrue(response_message.ParseFromString(response_values))
+    self.assertAllEqual([2, 3, 4], response_message.shape)
+    # For the base Rpc op, don't expect to get error status back.
+    self.assertEqual(errors.OK, status_code_values)
+    self.assertEqual(b'', status_message_values)
+
+  def testEmptyHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = []
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertAllEqual(response_tensors.shape, [0])
+      response_values = sess.run(response_tensors)
+    self.assertAllEqual(response_values.shape, [0])
+
+  def testInvalidAddresses(self):
+    with self.test_session() as sess:
+      with self.assertRaisesOpError(self.invalid_method_string):
+        sess.run(
+            self.rpc(
+                method='/InvalidService.IncrementTestShapes',
+                address=self._address,
+                request=''))
+
+      with self.assertRaisesOpError(self.invalid_method_string):
+        sess.run(
+            self.rpc(
+                method=self.get_method_name('InvalidMethodName'),
+                address=self._address,
+                request=''))
+
+      # This also covers the case of address=''
+      # and address='localhost:293874293874'
+      with self.assertRaises(errors.UnavailableError):
+        sess.run(
+            self.rpc(
+                method=self.get_method_name('IncrementTestShapes'),
+                address='unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@',
+                request=''))
+
+      # Test invalid method with the TryRpc op
+      _, status_code_value, status_message_value = sess.run(
+          self.try_rpc(
+              method=self.get_method_name('InvalidMethodName'),
+              address=self._address,
+              request=''))
+      self.assertEqual(errors.UNIMPLEMENTED, status_code_value)
+      self.assertTrue(
+          self.invalid_method_string in status_message_value.decode('ascii'))
+
+  def testAlwaysFailingMethod(self):
+    with self.test_session() as sess:
+      response_tensors = self.rpc(
+          method=self.get_method_name('AlwaysFailWithInvalidArgument'),
+          address=self._address,
+          request='')
+      self.assertEqual(response_tensors.shape, ())
+      with self.assertRaisesOpError(I_WARNED_YOU):
+        sess.run(response_tensors)
+
+  def testSometimesFailingMethodWithManyRequests(self):
+    with self.test_session() as sess:
+      # Fail hard by default.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SometimesFailWithInvalidArgument'),
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      with self.assertRaisesOpError(I_WARNED_YOU):
+        sess.run(response_tensors)
+
+      # Don't fail hard, use TryRpc - return the failing status instead.
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('SometimesFailWithInvalidArgument'),
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      self.assertEqual(status_code.shape, (20,))
+      self.assertEqual(status_message.shape, (20,))
+      status_code_values, status_message_values = sess.run((status_code,
+                                                            status_message))
+      # Every per-request status must be either OK or INVALID_ARGUMENT.
+      self.assertTrue(all(x in (errors.OK, errors.INVALID_ARGUMENT)
+                          for x in status_code_values))
+      expected_message_values = np.where(
+          status_code_values == errors.INVALID_ARGUMENT,
+          I_WARNED_YOU.encode('ascii'), b'')
+      self.assertAllEqual(expected_message_values, status_message_values)
+
+  def testVecHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(response_tensors.shape, (20,))
+      response_values = sess.run(response_tensors)
+    self.assertEqual(response_values.shape, (20,))
+    for i in range(20):
+      response_message = test_example_pb2.TestCase()
+      self.assertTrue(response_message.ParseFromString(response_values[i]))
+      self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
+
+  def testVecHostPortManyParallelRpcs(self):
+    with self.test_session() as sess:
+      request_tensors = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      many_response_tensors = [
+          self.rpc(
+              method=self.get_method_name('IncrementTestShapes'),
+              address=self._address,
+              request=request_tensors) for _ in range(10)
+      ]
+      # Launch parallel 10 calls to the RpcOp, each containing
+      # 20 rpc requests.
+      many_response_values = sess.run(many_response_tensors)
+    self.assertEqual(10, len(many_response_values))
+    for response_values in many_response_values:
+      self.assertEqual(response_values.shape, (20,))
+      for i in range(20):
+        response_message = test_example_pb2.TestCase()
+        self.assertTrue(response_message.ParseFromString(response_values[i]))
+        self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
+
+  def testVecHostPortRpcUsingEncodeAndDecodeProto(self):
+    with self.test_session() as sess:
+      request_tensors = encode_proto_op.encode_proto(
+          message_type='tensorflow.contrib.rpc.TestCase',
+          field_names=['shape'],
+          sizes=[[3]] * 20,
+          values=[
+              [[i, i + 1, i + 2] for i in range(20)],
+          ])
+      response_tensor_strings = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      _, (response_shape,) = decode_proto_op.decode_proto(
+          bytes=response_tensor_strings,
+          message_type='tensorflow.contrib.rpc.TestCase',
+          field_names=['shape'],
+          output_types=[dtypes.int32])
+      response_shape_values = sess.run(response_shape)
+    self.assertAllEqual([[i + 1, i + 2, i + 3]
+                         for i in range(20)], response_shape_values)
+
+  def testVecHostPortRpcCancelsUponSessionTimeOutWhenSleepingForever(self):
+    with self.test_session() as sess:
+      request_tensors = [''] * 25  # This will launch 25 RPC requests.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SleepForever'),
+          address=self._address,
+          request=request_tensors)
+      for timeout_ms in [1, 500, 1000]:
+        options = config_pb2.RunOptions(timeout_in_ms=timeout_ms)
+        with self.assertRaises((errors.UnavailableError,
+                                errors.DeadlineExceededError)):
+          sess.run(response_tensors, options=options)
+
+  def testVecHostPortRpcCancelsUponConfiguredTimeOutWhenSleepingForever(self):
+    with self.test_session() as sess:
+      request_tensors = [''] * 25  # This will launch 25 RPC requests.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SleepForever'),
+          address=self._address,
+          timeout_in_ms=1000,
+          request=request_tensors)
+      with self.assertRaises(errors.DeadlineExceededError):
+        sess.run(response_tensors)
+
+  def testTryRpcPropagatesDeadlineErrorWithSometimesTimingOutRequests(self):
+    with self.test_session() as sess:
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('SometimesSleepForever'),
+          timeout_in_ms=1000,
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      self.assertEqual(status_code.shape, (20,))
+      self.assertEqual(status_message.shape, (20,))
+      status_code_values = sess.run(status_code)
+      # Every per-request status must be either OK or DEADLINE_EXCEEDED.
+      self.assertTrue(all(x in (errors.OK, errors.DEADLINE_EXCEEDED)
+                          for x in status_code_values))
+
+  def testTryRpcWithMultipleAddressesSingleRequest(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      addresses = flatten([[
+          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+      ] for _ in range(10)])
+      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
+      response_tensors, status_code, _ = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=addresses,
+          request=request)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
+          status_code_values)
+      for i in range(10):
+        self.assertTrue(response_tensors_values[2 * i])
+        self.assertFalse(response_tensors_values[2 * i + 1])
+
+  def testTryRpcWithMultipleMethodsSingleRequest(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      methods = flatten(
+          [[self.get_method_name('IncrementTestShapes'), 'InvalidMethodName']
+           for _ in range(10)])
+      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
+      response_tensors, status_code, _ = self.try_rpc(
+          method=methods, address=self._address, request=request)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNIMPLEMENTED] for _ in range(10)),
+          status_code_values)
+      for i in range(10):
+        self.assertTrue(response_tensors_values[2 * i])
+        self.assertFalse(response_tensors_values[2 * i + 1])
+
+  def testTryRpcWithMultipleAddressesAndRequests(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      addresses = flatten([[
+          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+      ] for _ in range(10)])
+      requests = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      response_tensors, status_code, _ = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=addresses,
+          request=requests)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
+          status_code_values)
+      for i in range(20):
+        if i % 2 == 1:
+          self.assertFalse(response_tensors_values[i])
+        else:
+          response_message = test_example_pb2.TestCase()
+          self.assertTrue(
+              response_message.ParseFromString(response_tensors_values[i]))
+          self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
new file mode 100644
index 0000000000..7cbd636cb1
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Test servicer for RpcOp tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import time
+
+import grpc
+
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
+
+
+class RpcOpTestServicer(test_example_pb2_grpc.TestCaseServiceServicer):
+  """Test servicer for RpcOp tests."""
+
+  def IncrementTestShapes(self, request, context):
+    """Increment the entries in the shape attribute of request.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    for i in range(len(request.shape)):
+      request.shape[i] += 1
+    return request
+
+  def AlwaysFailWithInvalidArgument(self, request, context):
+    """Always fails with an InvalidArgument status.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      None; the failure is reported through the status set on `context`.
+    """
+    del request
+    context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+    context.set_details(rpc_op_test_base.I_WARNED_YOU)
+
+  def SometimesFailWithInvalidArgument(self, request, context):
+    """Sometimes fails with an InvalidArgument status.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    if random.randint(0, 1) == 1:
+      context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+      context.set_details(rpc_op_test_base.I_WARNED_YOU)
+    return request
+
+  def SleepForever(self, request, context):
+    """Sleeps for 5 seconds (a stand-in for sleeping forever).
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      None; no response message is produced.
+    """
+    # TODO(ebrevdo): Make this async wait like the stubby version.
+    time.sleep(5)
+
+  def SometimesSleepForever(self, request, context):
+    """Sleeps for 5 seconds about half of the time, then echoes the request.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    if random.randint(0, 1) == 1:
+      time.sleep(5)
+    return request
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto b/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
new file mode 100644
index 0000000000..96f4550f62
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
@@ -0,0 +1,171 @@
+// Test description and protos to work with it.
+//
+// Many of the protos in this file are for unit tests that haven't been written yet.
+
+syntax = "proto2";
+
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow.contrib.rpc;
+
+// A TestCase holds a proto and a bunch of assertions
+// about how it should decode.
+message TestCase {
+  // A batch of primitives to be serialized and decoded.
+  repeated RepeatedPrimitiveValue primitive = 1;
+  // The shape of the batch.
+  repeated int32 shape = 2;
+  // Expected sizes for each field.
+  repeated int32 sizes = 3;
+  // Expected values for each field.
+  repeated FieldSpec field = 4;
+};
+
+service TestCaseService {
+  // Copy input, and increment each entry in 'shape' by 1.
+  rpc IncrementTestShapes(TestCase) returns (TestCase) {
+  }
+
+  // Sleep forever.
+  rpc SleepForever(TestCase) returns (TestCase) {
+  }
+
+  // Sleep forever 50% of the time, return immediately the other 50%.
+  rpc SometimesSleepForever(TestCase) returns (TestCase) {
+  }
+
+  // Always fails with InvalidArgument.
+  rpc AlwaysFailWithInvalidArgument(TestCase) returns (TestCase) {
+  }
+
+  // Fails with InvalidArgument 50% of the time.
+  rpc SometimesFailWithInvalidArgument(TestCase) returns (TestCase) {
+  }
+};
+
+// FieldSpec describes the expected output for a single field.
+message FieldSpec {
+  optional string name = 1;
+  optional tensorflow.DataType dtype = 2;
+  optional RepeatedPrimitiveValue expected = 3;
+};
+
+message TestValue {
+  optional PrimitiveValue primitive_value = 1;
+  optional EnumValue enum_value = 2;
+  optional MessageValue message_value = 3;
+  optional RepeatedMessageValue repeated_message_value = 4;
+  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
+}
+
+message PrimitiveValue {
+  optional double double_value = 1;
+  optional float float_value = 2;
+  optional int64 int64_value = 3;
+  optional uint64 uint64_value = 4;
+  optional int32 int32_value = 5;
+  optional fixed64 fixed64_value = 6;
+  optional fixed32 fixed32_value = 7;
+  optional bool bool_value = 8;
+  optional string string_value = 9;
+  optional bytes bytes_value = 12;
+  optional uint32 uint32_value = 13;
+  optional sfixed32 sfixed32_value = 15;
+  optional sfixed64 sfixed64_value = 16;
+  optional sint32 sint32_value = 17;
+  optional sint64 sint64_value = 18;
+}
+
+// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
+message RepeatedPrimitiveValue {
+  repeated double double_value = 1;
+  repeated float float_value = 2;
+  repeated int64 int64_value = 3;
+  repeated uint64 uint64_value = 4;
+  repeated int32 int32_value = 5;
+  repeated fixed64 fixed64_value = 6;
+  repeated fixed32 fixed32_value = 7;
+  repeated bool bool_value = 8;
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13;
+  repeated sfixed32 sfixed32_value = 15;
+  repeated sfixed64 sfixed64_value = 16;
+  repeated sint32 sint32_value = 17;
+  repeated sint64 sint64_value = 18;
+  repeated PrimitiveValue message_value = 19;
+}
+
+// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
+// in the text format, but the binary serialization is different.
+// We test the packed representations by loading the same test cases
+// using this definition instead of RepeatedPrimitiveValue.
+// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
+// in every way except the packed=true declaration.
+message PackedPrimitiveValue {
+  repeated double double_value = 1 [packed = true];
+  repeated float float_value = 2 [packed = true];
+  repeated int64 int64_value = 3 [packed = true];
+  repeated uint64 uint64_value = 4 [packed = true];
+  repeated int32 int32_value = 5 [packed = true];
+  repeated fixed64 fixed64_value = 6 [packed = true];
+  repeated fixed32 fixed32_value = 7 [packed = true];
+  repeated bool bool_value = 8 [packed = true];
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13 [packed = true];
+  repeated sfixed32 sfixed32_value = 15 [packed = true];
+  repeated sfixed64 sfixed64_value = 16 [packed = true];
+  repeated sint32 sint32_value = 17 [packed = true];
+  repeated sint64 sint64_value = 18 [packed = true];
+  repeated PrimitiveValue message_value = 19;
+}
+
+message EnumValue {
+  enum Color {
+    RED = 0;
+    ORANGE = 1;
+    YELLOW = 2;
+    GREEN = 3;
+    BLUE = 4;
+    INDIGO = 5;
+    VIOLET = 6;
+  };
+  optional Color enum_value = 14;
+  repeated Color repeated_enum_value = 15;
+}
+
+
+message InnerMessageValue {
+  optional float float_value = 2;
+  repeated bytes bytes_values = 8;
+}
+
+message MiddleMessageValue {
+  repeated int32 int32_values = 5;
+  optional InnerMessageValue message_value = 11;
+  optional uint32 uint32_value = 13;
+}
+
+message MessageValue {
+  optional double double_value = 1;
+  optional MiddleMessageValue message_value = 11;
+}
+
+message RepeatedMessageValue {
+  message NestedMessageValue {
+    optional float float_value = 2;
+    repeated bytes bytes_values = 8;
+  }
+
+  repeated NestedMessageValue message_values = 11;
+}
+
+// Message containing fields with field numbers higher than any field above. An
+// instance of this message is prepended to each binary message in the test to
+// exercise the code path that handles fields encoded out of order of field
+// number.
+message ExtraFields {
+  optional string string_value = 1776;
+  optional bool bool_value = 1777;
+}
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index e01e076bcf..a43f5745c0 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -1,7 +1,6 @@
 # Platform-specific build configurations.
 
 load("@protobuf_archive//:protobuf.bzl", "proto_gen")
-load("@protobuf_archive//:protobuf.bzl", "py_proto_library")
 load("//tensorflow:tensorflow.bzl", "if_not_mobile")
 load("//tensorflow:tensorflow.bzl", "if_windows")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
@@ -110,6 +109,12 @@ def _proto_cc_srcs(srcs, use_grpc_plugin=False):
     ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
   return ret
 
+def _proto_py_outs(srcs, use_grpc_plugin=False):
+  ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs]
+  if use_grpc_plugin:
+    ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs]
+  return ret
+
 # Re-defined protocol buffer rule to allow building "header only" protocol
 # buffers, to avoid duplicate registrations. Also allows non-iterable cc_libs
 # containing select() statements.
@@ -212,6 +217,80 @@ def cc_proto_library(
       hdrs=gen_hdrs,
       **kargs)
 
+# Re-defined protocol buffer rule to bring in the change introduced in commit
+# https://github.com/google/protobuf/commit/294b5758c373cbab4b72f35f4cb62dc1d8332b68
+# which was not part of a stable protobuf release in 04/2018.
+# TODO(jsimsa): Remove this once the protobuf dependency version is updated
+# to include the above commit.
+def py_proto_library(
+        name,
+        srcs=[],
+        deps=[],
+        py_libs=[],
+        py_extra_srcs=[],
+        include=None,
+        default_runtime="@protobuf_archive//:protobuf_python",
+        protoc="@protobuf_archive//:protoc",
+        use_grpc_plugin=False,
+        **kargs):
+  """Bazel rule to create a Python protobuf library from proto source files
+
+  NOTE: the rule is only an internal workaround to generate protos. The
+  interface may change and the rule may be removed when bazel has introduced
+  the native rule.
+
+  Args:
+    name: the name of the py_proto_library.
+    srcs: the .proto files of the py_proto_library.
+    deps: a list of dependency labels; must be py_proto_library.
+    py_libs: a list of other py_library targets depended by the generated
+        py_library.
+    py_extra_srcs: extra source files that will be added to the output
+        py_library. This attribute is used for internal bootstrapping.
+    include: a string indicating the include path of the .proto files.
+    default_runtime: the implicitly default runtime which will be depended on by
+        the generated py_library target.
+    protoc: the label of the protocol compiler to generate the sources.
+    use_grpc_plugin: a flag to indicate whether to call the gRPC Python plugin
+        when processing the proto files.
+    **kargs: other keyword arguments that are passed to py_library.
+  """
+  outs = _proto_py_outs(srcs, use_grpc_plugin)
+
+  includes = []
+  if include != None:
+    includes = [include]
+
+  grpc_python_plugin = None
+  if use_grpc_plugin:
+    grpc_python_plugin = "//external:grpc_python_plugin"
+    # Note: Generated grpc code depends on Python grpc module. This dependency
+    # is not explicitly listed in py_libs. Instead, host system is assumed to
+    # have grpc installed.
+
+  proto_gen(
+      name=name + "_genproto",
+      srcs=srcs,
+      deps=[s + "_genproto" for s in deps],
+      includes=includes,
+      protoc=protoc,
+      gen_py=1,
+      outs=outs,
+      visibility=["//visibility:public"],
+      plugin=grpc_python_plugin,
+      plugin_language="grpc"
+  )
+
+  if default_runtime and not default_runtime in py_libs + deps:
+    py_libs = py_libs + [default_runtime]
+
+  native.py_library(
+      name=name,
+      srcs=outs+py_extra_srcs,
+      deps=py_libs+deps,
+      imports=includes,
+      **kargs)
+
 def tf_proto_library_cc(name, srcs = [], has_services = None,
                         protodeps = [],
                         visibility = [], testonly = 0,
@@ -256,8 +335,7 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
   )
 
 def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
-                        testonly=0,
-                        srcs_version="PY2AND3"):
+                        testonly=0, srcs_version="PY2AND3", use_grpc_plugin=False):
   py_proto_library(
       name = name + "_py",
       srcs = srcs,
@@ -267,6 +345,7 @@ def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
       default_runtime = "@protobuf_archive//:protobuf_python",
       visibility = visibility,
       testonly = testonly,
+      use_grpc_plugin = use_grpc_plugin,
   )
 
 def tf_jspb_proto_library(**kwargs):
@@ -305,6 +384,7 @@ def tf_proto_library(name, srcs = [], has_services = None,
       srcs_version = "PY2AND3",
       testonly = testonly,
       visibility = visibility,
+      use_grpc_plugin = has_services,
   )
 
 def tf_additional_lib_hdrs(exclude = []):
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index a0bae23a7c..2ef105755f 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -76,6 +76,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/predictor:predictor_pip",
     "//tensorflow/contrib/proto:proto_pip",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
+    "//tensorflow/contrib/rpc:rpc_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
     "//tensorflow/contrib/signal:test_util",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 018a395063..48728ac131 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -752,6 +752,10 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "grpc_cpp_plugin",
       actual = "@grpc//:grpc_cpp_plugin",
   )
+  native.bind(
+      name = "grpc_python_plugin",
+      actual = "@grpc//:grpc_python_plugin",
+  )
 
   # gRPC has three empty C++ functions which it wants the user to define
   # at build time. https://github.com/grpc/grpc/issues/13590
-- 
GitLab


From 113f102164e822aa15d1e875287009fef9d8b823 Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Mon, 16 Apr 2018 12:56:14 -0700
Subject: [PATCH 048/450] boosted_trees: Make some regularizer/hyper-params as
 inputs instead of attributes.

PiperOrigin-RevId: 193085059
---
 ...tedTreesCalculateBestGainsPerFeature.pbtxt | 38 +++++++--------
 .../api_def_BoostedTreesPredict.pbtxt         |  6 ---
 .../api_def_BoostedTreesTrainingPredict.pbtxt |  6 ---
 .../api_def_BoostedTreesUpdateEnsemble.pbtxt  |  4 +-
 .../kernels/boosted_trees/prediction_ops.cc   | 16 +++----
 .../core/kernels/boosted_trees/stats_ops.cc   | 44 ++++++++++--------
 .../kernels/boosted_trees/training_ops.cc     | 19 ++++----
 tensorflow/core/ops/boosted_trees_ops.cc      | 36 +++++----------
 .../core/ops/compat/ops_history.v1.pbtxt      | 46 +++++++------------
 .../python/estimator/canned/boosted_trees.py  |  6 +--
 .../boosted_trees/prediction_ops_test.py      | 14 +-----
 11 files changed, 96 insertions(+), 139 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
index 62876a293c..7f18c64574 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -11,6 +11,24 @@ END
     name: "stats_summary_list"
     description: <<END
 A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+l1 regularization factor on leaf weights, per instance based.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+l2 regularization factor on leaf weights, per instance based.
+END
+  }
+  in_arg {
+    name: "tree_complexity"
+    description: <<END
+adjustment to the gain, per leaf based.
 END
   }
   out_arg {
@@ -41,24 +59,6 @@ END
     name: "right_node_contribs_list"
     description: <<END
 A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
-END
-  }
-  attr {
-    name: "l1"
-    description: <<END
-l1 regularization factor on leaf weights, per instance based.
-END
-  }
-  attr {
-    name: "l2"
-    description: <<END
-l2 regularization factor on leaf weights, per instance based.
-END
-  }
-  attr {
-    name: "tree_complexity"
-    description: <<END
-adjustment to the gain, per leaf based.
 END
   }
   attr {
@@ -84,4 +84,4 @@ In this manner, the output is the best split per features and per node, so that
 The length of output lists are all of the same length, `num_features`.
 The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 END
-}
\ No newline at end of file
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
index b23e77a1fa..60ad9b4640 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
@@ -25,12 +25,6 @@ END
     description: <<END
 scalar, dimension of the logits, to be used for partial logits
 shape.
-END
-  }
-  attr {
-    name: "max_depth"
-    description: <<END
-scalar, max depth of trees. To be used for parallelization costs.
 END
   }
   summary: "Runs multiple additive regression ensemble predictors on input instances and"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
index 7203d3cb58..f8a3639c9b 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
@@ -52,12 +52,6 @@ END
     description: <<END
 scalar, dimension of the logits, to be used for partial logits
 shape.
-END
-  }
-  attr {
-    name: "max_depth"
-    description: <<END
-scalar, max depth of trees. To be used for parallelization costs.
 END
   }
   summary: "Runs multiple additive regression ensemble predictors on input instances and"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
index 00f8953875..3cf486d087 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
@@ -51,13 +51,13 @@ of the feature's splits. Will be added to the previous node values to constitute
 the values of the right nodes.
 END
   }
-  attr {
+  in_arg {
     name: "max_depth"
     description: <<END
 Max depth of the tree to build.
 END
   }
-  attr {
+  in_arg {
     name: "learning_rate"
     description: <<END
 shrinkage const for each new tree.
diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
index b13a450546..1b5ce32b7b 100644
--- a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
@@ -50,7 +50,6 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
     OP_REQUIRES(context, logits_dimension_ == 1,
                 errors::InvalidArgument(
                     "Currently only one dimensional outputs are supported."));
-    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
   }
 
   void Compute(OpKernelContext* const context) override {
@@ -155,9 +154,10 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
           output_partial_logits(i, 0) = partial_all_logit;
         }
       };
-      // Assume we will not go over more than one full tree. 4 is a magic
-      // number.
-      const int64 cost = 4 * max_depth_;
+      // 30 is the magic number. The actual value might be a function of (the
+      // number of layers) * (cpu cycles spent on each layer), but this value
+      // would work for many cases. May be tuned later.
+      const int64 cost = 30;
       thread::ThreadPool* const worker_threads =
           context->device()->tensorflow_cpu_worker_threads()->workers;
       Shard(worker_threads->NumThreads(), worker_threads, batch_size,
@@ -168,7 +168,6 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
  private:
   int32 logits_dimension_;         // the size of the output prediction vector.
   int32 num_bucketized_features_;  // Indicates the number of features.
-  int32 max_depth_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("BoostedTreesTrainingPredict").Device(DEVICE_CPU),
@@ -186,7 +185,6 @@ class BoostedTreesPredictOp : public OpKernel {
     OP_REQUIRES(context, logits_dimension_ == 1,
                 errors::InvalidArgument(
                     "Currently only one dimensional outputs are supported."));
-    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
   }
 
   void Compute(OpKernelContext* const context) override {
@@ -243,7 +241,10 @@ class BoostedTreesPredictOp : public OpKernel {
         output_logits(i, 0) = tree_logit;
       }
     };
-    const int64 cost = (latest_tree + 1) * max_depth_;
+    // 10 is the magic number. The actual number might depend on (the number of
+    // layers in the trees) and (cpu cycles spent on each layer), but this
+    // value would work for many cases. May be tuned later.
+    const int64 cost = (latest_tree + 1) * 10;
     thread::ThreadPool* const worker_threads =
         context->device()->tensorflow_cpu_worker_threads()->workers;
     Shard(worker_threads->NumThreads(), worker_threads, batch_size,
@@ -254,7 +255,6 @@ class BoostedTreesPredictOp : public OpKernel {
   int32
       logits_dimension_;  // Indicates the size of the output prediction vector.
   int32 num_bucketized_features_;  // Indicates the number of features.
-  int32 max_depth_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("BoostedTreesPredict").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 16e65cf284..40f50333d3 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -29,10 +29,6 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
   explicit BoostedTreesCalculateBestGainsPerFeatureOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("l1", &l1_));
-    OP_REQUIRES_OK(context, context->GetAttr("l2", &l2_));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("tree_complexity", &tree_complexity_));
     OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
   }
@@ -54,6 +50,16 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     for (const auto& tensor : stats_summary_list) {
       stats_summary.emplace_back(tensor.tensor<float, 3>());
     }
+    const Tensor* l1_t;
+    OP_REQUIRES_OK(context, context->input("l1", &l1_t));
+    const auto l1 = l1_t->scalar<float>()();
+    const Tensor* l2_t;
+    OP_REQUIRES_OK(context, context->input("l2", &l2_t));
+    const auto l2 = l2_t->scalar<float>()();
+    const Tensor* tree_complexity_t;
+    OP_REQUIRES_OK(context,
+                   context->input("tree_complexity", &tree_complexity_t));
+    const auto tree_complexity = tree_complexity_t->scalar<float>()();
 
     // Allocate output lists of tensors:
     OpOutputList output_node_ids_list;
@@ -106,7 +112,8 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
         // Parent gain.
         float parent_gain;
         float unused;
-        CalculateWeightsAndGains(total_grad, total_hess, &unused, &parent_gain);
+        CalculateWeightsAndGains(total_grad, total_hess, l1, l2, &unused,
+                                 &parent_gain);
 
         for (int bucket = 0; bucket < num_buckets; ++bucket) {
           const float cum_grad_bucket = cum_grad[bucket];
@@ -114,13 +121,13 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
           // Left child.
           float contrib_for_left;
           float gain_for_left;
-          CalculateWeightsAndGains(cum_grad_bucket, cum_hess_bucket,
+          CalculateWeightsAndGains(cum_grad_bucket, cum_hess_bucket, l1, l2,
                                    &contrib_for_left, &gain_for_left);
           // Right child.
           float contrib_for_right;
           float gain_for_right;
           CalculateWeightsAndGains(total_grad - cum_grad_bucket,
-                                   total_hess - cum_hess_bucket,
+                                   total_hess - cum_hess_bucket, l1, l2,
                                    &contrib_for_right, &gain_for_right);
 
           if (gain_for_left + gain_for_right > best_gain) {
@@ -173,7 +180,7 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
       for (int i = 0; i < num_nodes; ++i) {
         output_node_ids_vec(i) = output_node_ids[i];
         // Adjust the gains to penalize by tree complexity.
-        output_gains_vec(i) = output_gains[i] - tree_complexity_;
+        output_gains_vec(i) = output_gains[i] - tree_complexity;
         output_thresholds_vec(i) = output_thresholds[i];
         // Logits are 1-dimensional for now.
         // TODO(nponomareva): Consider multi-dimensional logits.
@@ -184,8 +191,8 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
   }
 
  private:
-  void CalculateWeightsAndGains(const float g, const float h, float* weight,
-                                float* gain) {
+  void CalculateWeightsAndGains(const float g, const float h, const float l1,
+                                const float l2, float* weight, float* gain) {
     //
     // The formula for weight is -(g+l1*sgn(w))/(H+l2), for gain it is
     // (g+l1*sgn(w))^2/(h+l2).
@@ -196,11 +203,11 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     // 1) Assume w>0 => w=-(g+l1)/(h+l2)=> g+l1 < 0 => g < -l1
     // 2) Assume w<0 => w=-(g-l1)/(h+l2)=> g-l1 > 0 => g > l1
     // For g from (-l1, l1), thus there is no solution => set to 0.
-    if (l1_ > 0) {
-      if (g > l1_) {
-        g_with_l1 -= l1_;
-      } else if (g < -l1_) {
-        g_with_l1 += l1_;
+    if (l1 > 0) {
+      if (g > l1) {
+        g_with_l1 -= l1;
+      } else if (g < -l1) {
+        g_with_l1 += l1;
       } else {
         *weight = 0.0;
         *gain = 0.0;
@@ -208,19 +215,16 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
       }
     }
     // Apply L2 regularization.
-    if (h + l2_ <= kEps) {
+    if (h + l2 <= kEps) {
       // Avoid division by 0 or infinitesimal.
       *weight = 0;
       *gain = 0;
     } else {
-      *weight = -g_with_l1 / (h + l2_);
+      *weight = -g_with_l1 / (h + l2);
       *gain = -g_with_l1 * (*weight);
     }
   }
 
-  float l1_;
-  float l2_;
-  float tree_complexity_;
   int max_splits_;
   int num_features_;
 };
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index 67cac14c52..a14fd4a133 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -43,8 +43,6 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
  public:
   explicit BoostedTreesUpdateEnsembleOp(OpKernelConstruction* const context)
       : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
-    OP_REQUIRES_OK(context, context->GetAttr("learning_rate", &learning_rate_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
 
     int32 pruning_index;
@@ -79,8 +77,15 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
 
     const Tensor* feature_ids_t;
     OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
+    const auto feature_ids = feature_ids_t->vec<int32>();
 
-    auto feature_ids = feature_ids_t->vec<int32>();
+    const Tensor* max_depth_t;
+    OP_REQUIRES_OK(context, context->input("max_depth", &max_depth_t));
+    const auto max_depth = max_depth_t->scalar<int32>()();
+
+    const Tensor* learning_rate_t;
+    OP_REQUIRES_OK(context, context->input("learning_rate", &learning_rate_t));
+    const auto learning_rate = learning_rate_t->scalar<float>()();
 
     // Find best splits for each active node.
     std::map<int32, SplitCandidate> best_splits;
@@ -125,10 +130,10 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
       // For now assume that the weights vectors are one dimensional.
       // TODO(nponomareva): change here for multiclass.
       const float left_contrib =
-          learning_rate_ *
+          learning_rate *
           left_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
       const float right_contrib =
-          learning_rate_ *
+          learning_rate *
           right_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
 
       // unused.
@@ -145,7 +150,7 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
       // Update growable tree metadata.
       ensemble_resource->SetNumLayersGrown(current_tree, new_num_layers);
       // Finalize the tree if needed.
-      if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth_) {
+      if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth) {
         // If the tree is finalized, next growing will start from node 0;
         node_id_start = 0;
         node_id_end = 1;
@@ -216,8 +221,6 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
 
  private:
   int32 num_features_;
-  float learning_rate_;
-  int32 max_depth_;
   PruningMode pruning_mode_;
 };
 
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index 8af4903418..4d74e6d63a 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -37,9 +37,9 @@ REGISTER_OP("IsBoostedTreesEnsembleInitialized")
 REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
     .Input("node_id_range: int32")
     .Input("stats_summary_list: num_features * float32")
-    .Attr("l1: float")
-    .Attr("l2: float")
-    .Attr("tree_complexity: float")
+    .Input("l1: float")
+    .Input("l2: float")
+    .Input("tree_complexity: float")
     .Attr("max_splits: int >= 1")
     .Attr("num_features: int >= 1")  // not passed but populated automatically.
     .Output("node_ids_list: num_features * int32")
@@ -51,19 +51,6 @@ REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
       // Confirms the rank of the inputs and sets the shape of the outputs.
       int max_splits;
       int num_features;
-      float l1, l2, tree_complexity;
-      TF_RETURN_IF_ERROR(c->GetAttr("l1", &l1));
-      if (l1 < 0) {
-        return errors::InvalidArgument("l1 must be non-negative.");
-      }
-      TF_RETURN_IF_ERROR(c->GetAttr("l2", &l2));
-      if (l2 < 0) {
-        return errors::InvalidArgument("l2 must be non-negative.");
-      }
-      TF_RETURN_IF_ERROR(c->GetAttr("tree_complexity", &tree_complexity));
-      if (tree_complexity < 0) {
-        return errors::InvalidArgument("Tree complexity must be non-negative.");
-      }
       TF_RETURN_IF_ERROR(c->GetAttr("max_splits", &max_splits));
       TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
       shape_inference::ShapeHandle node_id_range_shape;
@@ -83,6 +70,12 @@ REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
         TF_RETURN_IF_ERROR(
             c->Merge(summary_shape_base, summary_shape, &unused_shape));
       }
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 1), 0, &unused_shape));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 2), 0, &unused_shape));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 3), 0, &unused_shape));
       // Sets the output lists.
       std::vector<shape_inference::ShapeHandle> output_shapes_vec(
           num_features, c->MakeShape({-1}));
@@ -185,9 +178,8 @@ REGISTER_OP("BoostedTreesMakeStatsSummary")
 REGISTER_OP("BoostedTreesPredict")
     .Input("tree_ensemble_handle: resource")
     .Input("bucketized_features: num_bucketized_features * int32")
-    .Attr("num_bucketized_features: int >= 1")
+    .Attr("num_bucketized_features: int >= 1")  // Inferred.
     .Attr("logits_dimension: int")
-    .Attr("max_depth: int >= 1")
     .Output("logits: float")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle feature_shape;
@@ -229,7 +221,6 @@ REGISTER_OP("BoostedTreesTrainingPredict")
     .Input("bucketized_features: num_bucketized_features * int32")
     .Attr("num_bucketized_features: int >= 1")
     .Attr("logits_dimension: int")
-    .Attr("max_depth: int >= 1")
     .Output("partial_logits: float")
     .Output("tree_ids: int32")
     .Output("node_ids: int32")
@@ -239,9 +230,6 @@ REGISTER_OP("BoostedTreesTrainingPredict")
       TF_RETURN_IF_ERROR(
           c->GetAttr("num_bucketized_features", &num_bucketized_features));
 
-      int max_depth;
-      TF_RETURN_IF_ERROR(c->GetAttr("max_depth", &max_depth));
-
       shape_inference::ShapeHandle unused_input;
       for (int i = 0; i < num_bucketized_features; ++i) {
         TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 3), 1, &feature_shape));
@@ -273,8 +261,8 @@ REGISTER_OP("BoostedTreesUpdateEnsemble")
     .Input("thresholds: num_features * int32")
     .Input("left_node_contribs: num_features * float")
     .Input("right_node_contribs: num_features * float")
-    .Attr("max_depth: int >= 1")
-    .Attr("learning_rate: float")
+    .Input("max_depth: int32")
+    .Input("learning_rate: float")
     .Attr("pruning_mode: int >=0")
     .Attr("num_features: int >= 0")  // Inferred.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 2f6f588d2c..c627fee352 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -10735,6 +10735,18 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
+  input_arg {
+    name: "l1"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "l2"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "tree_complexity"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "node_ids_list"
     type: DT_INT32
@@ -10760,18 +10772,6 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
-  attr {
-    name: "l1"
-    type: "float"
-  }
-  attr {
-    name: "l2"
-    type: "float"
-  }
-  attr {
-    name: "tree_complexity"
-    type: "float"
-  }
   attr {
     name: "max_splits"
     type: "int"
@@ -10934,12 +10934,6 @@ op {
     name: "logits_dimension"
     type: "int"
   }
-  attr {
-    name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   is_stateful: true
 }
 op {
@@ -10999,12 +10993,6 @@ op {
     name: "logits_dimension"
     type: "int"
   }
-  attr {
-    name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   is_stateful: true
 }
 op {
@@ -11042,15 +11030,13 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
-  attr {
+  input_arg {
     name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    type: DT_INT32
   }
-  attr {
+  input_arg {
     name: "learning_rate"
-    type: "float"
+    type: DT_FLOAT
   }
   attr {
     name: "pruning_mode"
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 0ecc8c7089..d099d308f5 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -325,8 +325,7 @@ def _bt_model_fn(
           # so no local copy is needed; using tree_ensemble directly.
           tree_ensemble_handle=tree_ensemble.resource_handle,
           bucketized_features=input_feature_list,
-          logits_dimension=head.logits_dimension,
-          max_depth=tree_hparams.max_depth)
+          logits_dimension=head.logits_dimension)
     else:
       if is_single_machine:
         local_tree_ensemble = tree_ensemble
@@ -361,8 +360,7 @@ def _bt_model_fn(
             cached_tree_ids=cached_tree_ids,
             cached_node_ids=cached_node_ids,
             bucketized_features=input_feature_list,
-            logits_dimension=head.logits_dimension,
-            max_depth=tree_hparams.max_depth)
+            logits_dimension=head.logits_dimension)
       logits = cached_logits + partial_logits
 
     # Create training graph.
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
index d132f15e51..54f33f3360 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -49,7 +49,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=2,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -116,7 +115,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=2,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values],
@@ -189,7 +187,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=4,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -299,7 +296,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=4,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -429,7 +425,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=2,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -562,7 +557,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=3,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -705,7 +699,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=3,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -782,7 +775,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=1,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -905,8 +897,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       predict_op = boosted_trees_ops.predict(
           tree_ensemble_handle,
           bucketized_features=[feature_0_values, feature_1_values],
-          logits_dimension=1,
-          max_depth=2)
+          logits_dimension=1)
 
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
@@ -915,8 +906,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       predict_op = boosted_trees_ops.predict(
           tree_ensemble_handle,
           bucketized_features=[feature_0_values, feature_1_values],
-          logits_dimension=1,
-          max_depth=2)
+          logits_dimension=1)
 
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
-- 
GitLab


From 91129bbb3cbc01c7ecc776048988ae83ba50e3c1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 18:03:05 -0700
Subject: [PATCH 049/450] Adding min node weight regularization

PiperOrigin-RevId: 193131300
---
 .../python/estimator/boosted_trees.py         | 18 +++-
 ...tedTreesCalculateBestGainsPerFeature.pbtxt |  8 +-
 .../core/kernels/boosted_trees/stats_ops.cc   |  9 ++
 tensorflow/core/ops/boosted_trees_ops.cc      |  1 +
 .../core/ops/compat/ops_history.v1.pbtxt      |  4 +
 .../python/estimator/canned/boosted_trees.py  | 85 ++++++++++---------
 .../estimator/canned/boosted_trees_test.py    |  3 +-
 .../boosted_trees/stats_ops_test.py           | 51 +++++++++++
 ....estimator.-boosted-trees-classifier.pbtxt |  2 +-
 ...w.estimator.-boosted-trees-regressor.pbtxt |  2 +-
 10 files changed, 138 insertions(+), 45 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
index 314c54ed00..00356ce0ca 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -36,6 +36,7 @@ class _BoostedTreesEstimator(estimator.Estimator):
                l1_regularization=0.,
                l2_regularization=0.,
                tree_complexity=0.,
+               min_node_weight=0.,
                config=None):
     """Initializes a `BoostedTreesEstimator` instance.
 
@@ -65,13 +66,16 @@ class _BoostedTreesEstimator(estimator.Estimator):
       l2_regularization: regularization multiplier applied to the square weights
         of the tree leafs.
       tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
       config: `RunConfig` object to configure the runtime settings.
     """
     # pylint:disable=protected-access
     # HParams for the model.
     tree_hparams = canned_boosted_trees._TreeHParams(
         n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity)
+        tree_complexity, min_node_weight)
 
     def _model_fn(features, labels, mode, config):
       return canned_boosted_trees._bt_model_fn(
@@ -96,6 +100,7 @@ def boosted_trees_classifier_train_in_memory(
     l1_regularization=0.,
     l2_regularization=0.,
     tree_complexity=0.,
+    min_node_weight=0.,
     config=None,
     train_hooks=None):
   """Trains a boosted tree classifier with in memory dataset.
@@ -162,6 +167,9 @@ def boosted_trees_classifier_train_in_memory(
     l2_regularization: regularization multiplier applied to the square weights
       of the tree leafs.
     tree_complexity: regularization factor to penalize trees with more leaves.
+    min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
     config: `RunConfig` object to configure the runtime settings.
     train_hooks: a list of Hook instances to be passed to estimator.train().
 
@@ -184,7 +192,7 @@ def boosted_trees_classifier_train_in_memory(
   # HParams for the model.
   tree_hparams = canned_boosted_trees._TreeHParams(
       n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-      tree_complexity)
+      tree_complexity, min_node_weight)
 
   def _model_fn(features, labels, mode, config):
     return canned_boosted_trees._bt_model_fn(
@@ -220,6 +228,7 @@ def boosted_trees_regressor_train_in_memory(
     l1_regularization=0.,
     l2_regularization=0.,
     tree_complexity=0.,
+    min_node_weight=0.,
     config=None,
     train_hooks=None):
   """Trains a boosted tree regressor with in memory dataset.
@@ -279,6 +288,9 @@ def boosted_trees_regressor_train_in_memory(
     l2_regularization: regularization multiplier applied to the square weights
       of the tree leafs.
     tree_complexity: regularization factor to penalize trees with more leaves.
+    min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
     config: `RunConfig` object to configure the runtime settings.
     train_hooks: a list of Hook instances to be passed to estimator.train().
 
@@ -300,7 +312,7 @@ def boosted_trees_regressor_train_in_memory(
   # HParams for the model.
   tree_hparams = canned_boosted_trees._TreeHParams(
       n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-      tree_complexity)
+      tree_complexity, min_node_weight)
 
   def _model_fn(features, labels, mode, config):
     return canned_boosted_trees._bt_model_fn(
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
index 7f18c64574..3f181e91ce 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -29,6 +29,12 @@ END
     name: "tree_complexity"
     description: <<END
 adjustment to the gain, per leaf based.
+END
+  }
+  in_arg {
+    name: "min_node_weight"
+    description: <<END
+minimum avg of hessians in a node required for the node to be considered for splitting.
 END
   }
   out_arg {
@@ -84,4 +90,4 @@ In this manner, the output is the best split per features and per node, so that
 The length of output lists are all of the same length, `num_features`.
 The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 END
-}
+}
\ No newline at end of file
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 40f50333d3..6dfcd63ab3 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -60,6 +60,10 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->input("tree_complexity", &tree_complexity_t));
     const auto tree_complexity = tree_complexity_t->scalar<float>()();
+    const Tensor* min_node_weight_t;
+    OP_REQUIRES_OK(context,
+                   context->input("min_node_weight", &min_node_weight_t));
+    const auto min_node_weight = min_node_weight_t->scalar<float>()();
 
     // Allocate output lists of tensors:
     OpOutputList output_node_ids_list;
@@ -105,6 +109,11 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
           cum_grad.push_back(total_grad);
           cum_hess.push_back(total_hess);
         }
+        // Check if node has enough of average hessian.
+        if (total_hess < min_node_weight) {
+          // Do not split the node because not enough avg hessian.
+          continue;
+        }
         float best_gain = std::numeric_limits<float>::lowest();
         float best_bucket = 0;
         float best_contrib_for_left = 0.0;
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index 4d74e6d63a..88d6eaf819 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -40,6 +40,7 @@ REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
     .Input("l1: float")
     .Input("l2: float")
     .Input("tree_complexity: float")
+    .Input("min_node_weight: float")
     .Attr("max_splits: int >= 1")
     .Attr("num_features: int >= 1")  // not passed but populated automatically.
     .Output("node_ids_list: num_features * int32")
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index c627fee352..4a24c44d69 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -10747,6 +10747,10 @@ op {
     name: "tree_complexity"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_node_weight"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "node_ids_list"
     type: DT_INT32
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index d099d308f5..536bd2bf81 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -40,9 +40,11 @@ from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
 
-_TreeHParams = collections.namedtuple(
-    'TreeHParams',
-    ['n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity'])
+# TODO(nponomareva): Reveal pruning params here.
+_TreeHParams = collections.namedtuple('TreeHParams', [
+    'n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity',
+    'min_node_weight'
+])
 
 _HOLD_FOR_MULTI_CLASS_SUPPORT = object()
 _HOLD_FOR_MULTI_DIM_SUPPORT = object()
@@ -397,6 +399,7 @@ def _bt_model_fn(
                  l1=tree_hparams.l1,
                  l2=tree_hparams.l2,
                  tree_complexity=tree_hparams.tree_complexity,
+                 min_node_weight=tree_hparams.min_node_weight,
                  max_splits=max_splits))
         grow_op = boosted_trees_ops.update_ensemble(
             # Confirm if local_tree_ensemble or tree_ensemble should be used.
@@ -515,21 +518,21 @@ def _create_regression_head(label_dimension, weight_column=None):
 class BoostedTreesClassifier(estimator.Estimator):
   """A Classifier for Tensorflow Boosted Trees models."""
 
-  def __init__(
-      self,
-      feature_columns,
-      n_batches_per_layer,
-      model_dir=None,
-      n_classes=_HOLD_FOR_MULTI_CLASS_SUPPORT,
-      weight_column=None,
-      label_vocabulary=None,
-      n_trees=100,
-      max_depth=6,
-      learning_rate=0.1,
-      l1_regularization=0.,
-      l2_regularization=0.,
-      tree_complexity=0.,
-      config=None):
+  def __init__(self,
+               feature_columns,
+               n_batches_per_layer,
+               model_dir=None,
+               n_classes=_HOLD_FOR_MULTI_CLASS_SUPPORT,
+               weight_column=None,
+               label_vocabulary=None,
+               n_trees=100,
+               max_depth=6,
+               learning_rate=0.1,
+               l1_regularization=0.,
+               l2_regularization=0.,
+               tree_complexity=0.,
+               min_node_weight=0.,
+               config=None):
     """Initializes a `BoostedTreesClassifier` instance.
 
     Example:
@@ -593,6 +596,9 @@ class BoostedTreesClassifier(estimator.Estimator):
       l2_regularization: regularization multiplier applied to the square weights
         of the tree leafs.
       tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: minimum hessian a node must have for a
+        split to be considered. The value will be compared with
+        sum(leaf_hessian)/(batch_size * n_batches_per_layer).
       config: `RunConfig` object to configure the runtime settings.
 
     Raises:
@@ -606,9 +612,9 @@ class BoostedTreesClassifier(estimator.Estimator):
         n_classes, weight_column, label_vocabulary=label_vocabulary)
 
     # HParams for the model.
-    tree_hparams = _TreeHParams(
-        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity)
+    tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
+                                l1_regularization, l2_regularization,
+                                tree_complexity, min_node_weight)
 
     def _model_fn(features, labels, mode, config):
       return _bt_model_fn(  # pylint: disable=protected-access
@@ -630,20 +636,20 @@ class BoostedTreesClassifier(estimator.Estimator):
 class BoostedTreesRegressor(estimator.Estimator):
   """A Regressor for Tensorflow Boosted Trees models."""
 
-  def __init__(
-      self,
-      feature_columns,
-      n_batches_per_layer,
-      model_dir=None,
-      label_dimension=_HOLD_FOR_MULTI_DIM_SUPPORT,
-      weight_column=None,
-      n_trees=100,
-      max_depth=6,
-      learning_rate=0.1,
-      l1_regularization=0.,
-      l2_regularization=0.,
-      tree_complexity=0.,
-      config=None):
+  def __init__(self,
+               feature_columns,
+               n_batches_per_layer,
+               model_dir=None,
+               label_dimension=_HOLD_FOR_MULTI_DIM_SUPPORT,
+               weight_column=None,
+               n_trees=100,
+               max_depth=6,
+               learning_rate=0.1,
+               l1_regularization=0.,
+               l2_regularization=0.,
+               tree_complexity=0.,
+               min_node_weight=0.,
+               config=None):
     """Initializes a `BoostedTreesRegressor` instance.
 
     Example:
@@ -700,6 +706,9 @@ class BoostedTreesRegressor(estimator.Estimator):
       l2_regularization: regularization multiplier applied to the square weights
         of the tree leafs.
       tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: minimum hessian a node must have for a
+        split to be considered. The value will be compared with
+        sum(leaf_hessian)/(batch_size * n_batches_per_layer).
       config: `RunConfig` object to configure the runtime settings.
 
     Raises:
@@ -712,9 +721,9 @@ class BoostedTreesRegressor(estimator.Estimator):
     head = _create_regression_head(label_dimension, weight_column)
 
     # HParams for the model.
-    tree_hparams = _TreeHParams(
-        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity)
+    tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
+                                l1_regularization, l2_regularization,
+                                tree_complexity, min_node_weight)
 
     def _model_fn(features, labels, mode, config):
       return _bt_model_fn(  # pylint: disable=protected-access
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 7823ef8410..56e67a6707 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -188,7 +188,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         learning_rate=0.1,
         l1=0.,
         l2=0.01,
-        tree_complexity=0.)
+        tree_complexity=0.,
+        min_node_weight=0.)
 
   def _get_expected_ensembles_for_classification(self):
     first_round = """
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
index 4d09cf94d4..f0bb84e69a 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -59,6 +59,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=0.0,
           l2=0.0,
           tree_complexity=0.0,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
@@ -106,6 +107,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=0.0,
           l2=0.1,
           tree_complexity=0.0,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
@@ -154,6 +156,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=l1,
           l2=0.0,
           tree_complexity=0.0,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
@@ -205,6 +208,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=0.0,
           l2=l2,
           tree_complexity=tree_complexity,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
@@ -220,6 +224,53 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
                           sess.run(right_node_contribs_list))
 
+  def testCalculateBestGainsWithMinNodeWEight(self):
+    """Testing Gain calculation with min node weight."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .036], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .68], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .6], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .03], [-.4, .05], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=0.0,
+          tree_complexity=0.0,
+          min_node_weight=1,
+          max_splits=max_splits)
+
+      # We can't split node 1 on feature 0 and node 2 on feature 1 because of
+      # the min node weight.
+      self.assertAllEqual([[2], [1]], sess.run(node_ids_list))
+      self.assertAllClose([[0.384314], [0.098013]], sess.run(gains_list))
+      self.assertAllEqual([[1], [1]], sess.run(thresholds_list))
+      self.assertAllClose([[[0.4852941]], [[-.6]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-0.75]], [[-0.014925]]],
+                          sess.run(right_node_contribs_list))
+
   def testMakeStatsSummarySimple(self):
     """Simple test for MakeStatsSummary."""
     with self.test_session():
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index fd9be8c759..53a903c239 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 6b305be43f..ba17c90de2 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
   }
   member_method {
     name: "evaluate"
-- 
GitLab


From cbf1fc8ba96a6e9d6a36a2d09a82ea1ff9af2752 Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Mon, 16 Apr 2018 19:10:10 -0700
Subject: [PATCH 050/450] BoostedTreesEstimator in contrib: train_in_memory
 works with input_fns returning data.Dataset. Only one batch of data is
 expected, so dataset.batch() is disallowed, and dataset.repeat() will be
 ignored (only the first one would be used)

PiperOrigin-RevId: 193137094
---
 .../python/estimator/boosted_trees.py         |  38 +++-
 .../python/estimator/boosted_trees_test.py    |  80 +++++++--
 .../python/estimator/canned/boosted_trees.py  | 149 +++++++++++-----
 .../estimator/canned/boosted_trees_test.py    | 167 +++++++++++++++++-
 4 files changed, 362 insertions(+), 72 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
index 00356ce0ca..bd641014e9 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -17,10 +17,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
 
 
+def _validate_input_fn_and_repeat_dataset(train_input_fn):
+  """Validates whether the input_fn is valid, and repeat() if tf.Dataset."""
+  def _input_fn():
+    result_input_fn = train_input_fn()
+    if isinstance(result_input_fn, dataset_ops.Dataset):
+      return result_input_fn.repeat()
+    return result_input_fn
+
+  return _input_fn
+
+
 class _BoostedTreesEstimator(estimator.Estimator):
   """An Estimator for Tensorflow Boosted Trees models."""
 
@@ -113,10 +125,13 @@ def boosted_trees_classifier_train_in_memory(
   bucketized_feature_2 = bucketized_column(
     numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
 
-  def input_fn_train():
+  def train_input_fn():
     dataset = create-dataset-from-training-data
-    # Don't use repeat or cache, since it is assumed to be one epoch
-    # This is either tf.data.Dataset, or a tuple of feature dict and label.
+    # This is tf.data.Dataset of a tuple of feature dict and label.
+    #   e.g. Dataset.zip((Dataset.from_tensors({'f1': f1_array, ...}),
+    #                     Dataset.from_tensors(label_array)))
+    # The returned Dataset shouldn't be batched.
+    # If Dataset repeats, only the first repetition would be used for training.
     return dataset
 
   classifier = boosted_trees_classifier_train_in_memory(
@@ -210,7 +225,9 @@ def boosted_trees_classifier_train_in_memory(
   in_memory_classifier = estimator.Estimator(
       model_fn=_model_fn, model_dir=model_dir, config=config)
 
-  in_memory_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
+  in_memory_classifier.train(
+      input_fn=_validate_input_fn_and_repeat_dataset(train_input_fn),
+      hooks=train_hooks)
 
   return in_memory_classifier
   # pylint: enable=protected-access
@@ -241,10 +258,13 @@ def boosted_trees_regressor_train_in_memory(
   bucketized_feature_2 = bucketized_column(
     numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
 
-  def input_fn_train():
+  def train_input_fn():
     dataset = create-dataset-from-training-data
-    # Don't use repeat or cache, since it is assumed to be one epoch
-    # This is either tf.data.Dataset, or a tuple of feature dict and label.
+    # This is tf.data.Dataset of a tuple of feature dict and label.
+    #   e.g. Dataset.zip((Dataset.from_tensors({'f1': f1_array, ...}),
+    #                     Dataset.from_tensors(label_array)))
+    # The returned Dataset shouldn't be batched.
+    # If Dataset repeats, only the first repetition would be used for training.
     return dataset
 
   regressor = boosted_trees_regressor_train_in_memory(
@@ -329,7 +349,9 @@ def boosted_trees_regressor_train_in_memory(
   in_memory_regressor = estimator.Estimator(
       model_fn=_model_fn, model_dir=model_dir, config=config)
 
-  in_memory_regressor.train(input_fn=train_input_fn, hooks=train_hooks)
+  in_memory_regressor.train(
+      input_fn=_validate_input_fn_and_repeat_dataset(train_input_fn),
+      hooks=train_hooks)
 
   return in_memory_regressor
   # pylint: enable=protected-access
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
index eee5910687..76cbefe5e9 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -21,6 +21,7 @@ import numpy as np
 
 from tensorflow.contrib.estimator.python.estimator import boosted_trees
 from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
@@ -49,12 +50,24 @@ def _make_train_input_fn(is_classification):
   """Makes train input_fn for classification/regression."""
 
   def _input_fn():
-    features = dict(FEATURES_DICT)
-    if is_classification:
-      labels = CLASSIFICATION_LABELS
-    else:
-      labels = REGRESSION_LABELS
-    return features, labels
+    features_dict = dict(FEATURES_DICT)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    return features_dict, labels
+
+  return _input_fn
+
+
+def _make_train_input_fn_dataset(is_classification):
+  """Makes input_fn using Dataset."""
+
+  def _input_fn():
+    features_dict = dict(FEATURES_DICT)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    ds = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.from_tensors(features_dict),
+         dataset_ops.Dataset.from_tensors(labels)
+        ))
+    return ds
 
   return _input_fn
 
@@ -132,15 +145,13 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
         x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
 
     est = boosted_trees.boosted_trees_classifier_train_in_memory(
-        train_input_fn=train_input_fn,
-        feature_columns=self._feature_columns,
-        n_trees=1,
-        max_depth=5)
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
     self._assert_checkpoint(
         est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
-    # Check eval.
+    # Check evaluate and predict.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
     # Validate predictions.
@@ -148,24 +159,59 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     self.assertAllClose([[0], [1], [1], [0], [0]],
                         [pred['class_ids'] for pred in predictions])
 
+  def testBinaryClassifierTrainInMemoryWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_classifier_train_in_memory(
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+
+    # Check evaluate and predict.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
   def testRegressorTrainInMemoryAndEvalAndInfer(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
     predict_input_fn = numpy_io.numpy_input_fn(
         x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
 
     est = boosted_trees.boosted_trees_regressor_train_in_memory(
-        train_input_fn=train_input_fn,
-        feature_columns=self._feature_columns,
-        n_trees=1,
-        max_depth=5)
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
     self._assert_checkpoint(
         est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
-    # Check eval.
+    # Check evaluate and predict.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testRegressorTrainInMemoryWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_regressor_train_in_memory(
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    # Check evaluate and predict.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
     self.assertAllClose(eval_res['average_loss'], 2.478283)
-    # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
     self.assertAllClose(
         [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 536bd2bf81..085dace1b3 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
@@ -50,6 +51,32 @@ _HOLD_FOR_MULTI_CLASS_SUPPORT = object()
 _HOLD_FOR_MULTI_DIM_SUPPORT = object()
 
 
+def _get_max_buckets(feature_columns):
+  """Gets the maximum number of buckets from feature_columns.
+
+  Args:
+    feature_columns: a list/set of tf.feature_column.
+
+  Returns:
+    max_buckets: the maximum number of buckets among bucketized_columns.
+
+  Raises:
+    ValueError: when unsupported feature_columns are given.
+  """
+  if not feature_columns:
+    raise ValueError('feature_columns must be a non-empty list/set of '
+                     'tf.feature_column.')
+  max_buckets = 1
+  for fc in feature_columns:
+    if isinstance(fc, feature_column_lib._BucketizedColumn):  # pylint:disable=protected-access
+      # N boundaries creates (N+1) buckets.
+      max_buckets = max(max_buckets, len(fc.boundaries) + 1)
+    else:
+      raise ValueError('For now, only bucketized_column is supported but '
+                       'got: {}'.format(fc))
+  return max_buckets
+
+
 def _get_transformed_features(features, feature_columns):
   """Gets the transformed features from features/feature_columns pair.
 
@@ -59,36 +86,31 @@ def _get_transformed_features(features, feature_columns):
 
   Returns:
     result_features: a list of the transformed features, sorted by the name.
-    num_buckets: the maximum number of buckets across bucketized_columns.
 
   Raises:
     ValueError: when unsupported features/columns are tried.
   """
-  num_buckets = 1
   # pylint:disable=protected-access
   for fc in feature_columns:
-    if isinstance(fc, feature_column_lib._BucketizedColumn):
-      # N boundaries creates (N+1) buckets.
-      num_buckets = max(num_buckets, len(fc.boundaries) + 1)
-    else:
+    if not isinstance(fc, feature_column_lib._BucketizedColumn):
       raise ValueError('For now, only bucketized_column is supported but '
                        'got: {}'.format(fc))
-  transformed = feature_column_lib._transform_features(features,
-                                                       feature_columns)
+  transformed_features = feature_column_lib._transform_features(
+      features, feature_columns)
   # pylint:enable=protected-access
   result_features = []
-  for column in sorted(transformed, key=lambda tc: tc.name):
+  for column in sorted(transformed_features, key=lambda tc: tc.name):
     source_name = column.source_column.name
-    squeezed_tensor = array_ops.squeeze(transformed[column], axis=1)
+    squeezed_tensor = array_ops.squeeze(transformed_features[column], axis=1)
     if len(squeezed_tensor.shape) > 1:
       raise ValueError('For now, only supports features equivalent to rank 1 '
                        'but column `{}` got: {}'.format(
                            source_name, features[source_name].shape))
     result_features.append(squeezed_tensor)
-  return result_features, num_buckets
+  return result_features
 
 
-def _keep_as_local_variable(tensor, name=None):
+def _local_variable(tensor, name=None):
   """Stores a tensor as a local Variable for faster read."""
   return variable_scope.variable(
       initial_value=tensor,
@@ -98,6 +120,48 @@ def _keep_as_local_variable(tensor, name=None):
       name=name)
 
 
+def _cache_transformed_features(features, feature_columns, batch_size):
+  """Transform features and cache, then returns (cached_features, cache_op)."""
+  num_features = len(feature_columns)
+  cached_features = [
+      _local_variable(
+          array_ops.zeros([batch_size], dtype=dtypes.int32),
+          name='cached_feature_{}'.format(i))
+      for i in range(num_features)
+  ]
+  are_features_cached = _local_variable(False, name='are_features_cached')
+
+  def cache_features_and_return():
+    """Caches transformed features.
+
+    The intention is to hide get_transformed_features() from the graph by
+    caching the result except the first step, since bucketize operation
+    (inside get_transformed_features) is expensive.
+
+    Returns:
+      input_feature_list: a list of input features.
+      cache_flip_op: op to add to graph to make sure cache update is included to
+          the graph.
+    """
+
+    transformed_features = _get_transformed_features(features, feature_columns)
+    cached = [
+        state_ops.assign(cached_features[i], transformed_features[i])
+        for i in range(num_features)
+    ]
+    # TODO(youngheek): Try other combination of dependencies so that the
+    # function returns a single result, not a tuple.
+    with ops.control_dependencies(cached):
+      cache_flip_op = are_features_cached.assign(True)
+    return cached, cache_flip_op
+
+  input_feature_list, cache_flip_op = control_flow_ops.cond(
+      are_features_cached,
+      lambda: (cached_features, control_flow_ops.no_op()),
+      cache_features_and_return)
+  return input_feature_list, cache_flip_op
+
+
 class _CacheTrainingStatesUsingHashTable(object):
   """Caching logits, etc. using MutableHashTable."""
 
@@ -186,13 +250,13 @@ class _CacheTrainingStatesUsingVariables(object):
       logits_dimension: a constant (int) for the dimension of logits.
     """
     self._logits_dimension = logits_dimension
-    self._tree_ids = _keep_as_local_variable(
+    self._tree_ids = _local_variable(
         array_ops.zeros([batch_size], dtype=dtypes.int32),
         name='tree_ids_cache')
-    self._node_ids = _keep_as_local_variable(
+    self._node_ids = _local_variable(
         array_ops.zeros([batch_size], dtype=dtypes.int32),
         name='node_ids_cache')
-    self._logits = _keep_as_local_variable(
+    self._logits = _local_variable(
         array_ops.zeros([batch_size, logits_dimension], dtype=dtypes.float32),
         name='logits_cache')
 
@@ -290,33 +354,38 @@ def _bt_model_fn(
         'When train_in_memory is enabled, input_fn should return the entire '
         'dataset as a single batch, and n_batches_per_layer should be set as '
         '1.')
+    if (not config.is_chief or config.num_worker_replicas > 1 or
+        config.num_ps_replicas > 0):
+      raise ValueError('train_in_memory is supported only for '
+                       'non-distributed training.')
   worker_device = control_flow_ops.no_op().device
   # maximum number of splits possible in the whole tree =2^(D-1)-1
   # TODO(youngheek): perhaps storage could be optimized by storing stats with
   # the dimension max_splits_per_layer, instead of max_splits (for the entire
   # tree).
   max_splits = (1 << tree_hparams.max_depth) - 1
+  max_buckets = _get_max_buckets(feature_columns)
+  train_op = []
   with ops.name_scope(name) as name:
     # Prepare.
     global_step = training_util.get_or_create_global_step()
-    input_feature_list, num_buckets = _get_transformed_features(
-        features, feature_columns)
-    if train_in_memory and mode == model_fn.ModeKeys.TRAIN:
-      input_feature_list = [
-          _keep_as_local_variable(feature) for feature in input_feature_list
-      ]
-    num_features = len(input_feature_list)
-
-    cache = None
-    if mode == model_fn.ModeKeys.TRAIN:
-      if train_in_memory and is_single_machine:  # maybe just train_in_memory?
-        batch_size = array_ops.shape(input_feature_list[0])[0]
-        cache = _CacheTrainingStatesUsingVariables(batch_size,
-                                                   head.logits_dimension)
-      elif example_id_column_name:
+    num_features = len(feature_columns)
+    # Extract input features and set up cache for training.
+    training_state_cache = None
+    if mode == model_fn.ModeKeys.TRAIN and train_in_memory:
+      # cache transformed features as well for in-memory training.
+      batch_size = array_ops.shape(labels)[0]
+      input_feature_list, input_cache_op = _cache_transformed_features(
+          features, feature_columns, batch_size)
+      train_op.append(input_cache_op)
+      training_state_cache = _CacheTrainingStatesUsingVariables(
+          batch_size, head.logits_dimension)
+    else:
+      input_feature_list = _get_transformed_features(features, feature_columns)
+      if mode == model_fn.ModeKeys.TRAIN and example_id_column_name:
         example_ids = features[example_id_column_name]
-        cache = _CacheTrainingStatesUsingHashTable(example_ids,
-                                                   head.logits_dimension)
+        training_state_cache = _CacheTrainingStatesUsingHashTable(
+            example_ids, head.logits_dimension)
 
     # Create Ensemble resources.
     tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
@@ -340,11 +409,12 @@ def _bt_model_fn(
         # TODO(soroush): Do partial updates if this becomes a bottleneck.
         ensemble_reload = local_tree_ensemble.deserialize(
             *tree_ensemble.serialize())
-      if cache:
-        cached_tree_ids, cached_node_ids, cached_logits = cache.lookup()
+      if training_state_cache:
+        cached_tree_ids, cached_node_ids, cached_logits = (
+            training_state_cache.lookup())
       else:
         # Always start from the beginning when no cache is set up.
-        batch_size = array_ops.shape(input_feature_list[0])[0]
+        batch_size = array_ops.shape(labels)[0]
         cached_tree_ids, cached_node_ids, cached_logits = (
             array_ops.zeros([batch_size], dtype=dtypes.int32),
             array_ops.zeros([batch_size], dtype=dtypes.int32),
@@ -368,9 +438,8 @@ def _bt_model_fn(
     # Create training graph.
     def _train_op_fn(loss):
       """Run one training iteration."""
-      train_op = []
-      if cache:
-        train_op.append(cache.insert(tree_ids, node_ids, logits))
+      if training_state_cache:
+        train_op.append(training_state_cache.insert(tree_ids, node_ids, logits))
       if closed_form_grad_and_hess_fn:
         gradients, hessians = closed_form_grad_and_hess_fn(logits, labels)
       else:
@@ -385,7 +454,7 @@ def _bt_model_fn(
                   hessians=hessians,
                   bucketized_features_list=[input_feature_list[f]],
                   max_splits=max_splits,
-                  num_buckets=num_buckets),
+                  num_buckets=max_buckets),
               axis=0) for f in range(num_features)
       ]
 
@@ -422,7 +491,7 @@ def _bt_model_fn(
         summary_accumulator = data_flow_ops.ConditionalAccumulator(
             dtype=dtypes.float32,
             # The stats consist of gradients and hessians (the last dimension).
-            shape=[num_features, max_splits, num_buckets, 2],
+            shape=[num_features, max_splits, max_buckets, 2],
             shared_name='stats_summary_accumulator')
         apply_grad = summary_accumulator.apply_grad(
             array_ops.stack(stats_summary_list, axis=0), stamp_token)
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 56e67a6707..c8c52d3bc6 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import model_fn
 from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator.canned import boosted_trees
@@ -58,13 +59,32 @@ def _make_train_input_fn(is_classification):
   """Makes train input_fn for classification/regression."""
 
   def _input_fn():
-    features = dict(FEATURES_DICT)
-    features[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
-    if is_classification:
-      labels = CLASSIFICATION_LABELS
+    features_dict = dict(FEATURES_DICT)
+    features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    return features_dict, labels
+
+  return _input_fn
+
+
+def _make_train_input_fn_dataset(is_classification, batch=None, repeat=None):
+  """Makes input_fn using Dataset."""
+
+  def _input_fn():
+    features_dict = dict(FEATURES_DICT)
+    features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    if batch:
+      ds = dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.from_tensor_slices(features_dict),
+           dataset_ops.Dataset.from_tensor_slices(labels))).batch(batch)
     else:
-      labels = REGRESSION_LABELS
-    return features, labels
+      ds = dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.from_tensors(features_dict),
+           dataset_ops.Dataset.from_tensors(labels)))
+    # repeat indefinitely by default, or stop at the given step.
+    ds = ds.repeat(repeat)
+    return ds
 
   return _input_fn
 
@@ -125,9 +145,28 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(train_input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
 
+  def testTrainClassifierWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
     predictions = list(est.predict(input_fn=predict_input_fn))
-    # All labels are correct.
     self.assertAllClose([[0], [1], [1], [0], [0]],
                         [pred['class_ids'] for pred in predictions])
 
@@ -166,12 +205,126 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     est.train(train_input_fn, steps=num_steps)
     self._assert_checkpoint(
         est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetBatch(self):
+    # The batch_size as the entire data size should yield the same result as
+    # dataset without batching.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=5)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetLargerBatch(self):
+    # The batch_size as the multiple of the entire data size should still yield
+    # the same result.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=15)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetSmallerBatch(self):
+    # Even when using small batches, if (n_batches_per_layer * batch_size)
+    # equals the entire data size, the result should be the same.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=1)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
 
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=5,
+        n_trees=1,
+        max_depth=5)
+    # Train stops after (n_batches_per_layer * n_trees * max_depth) steps.
+    est.train(train_input_fn, steps=100)
+    self._assert_checkpoint(
+        est.model_dir, global_step=25, finalized_trees=1, attempted_layers=5)
+    # 5 batches = one epoch.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=5)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
     predictions = list(est.predict(input_fn=predict_input_fn))
     self.assertAllClose(
         [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
         [pred['predictions'] for pred in predictions])
 
+  def testTrainRegressorWithDatasetWhenInputIsOverEarlier(self):
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, repeat=3)  # to stop input after 3 steps.
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    # Note that training will stop when the input is exhausted.
+    # This might not be a typical pattern, but dataset.repeat(3) causes
+    # the input stream to cease after 3 steps.
+    est.train(train_input_fn, steps=100)
+    self._assert_checkpoint(
+        est.model_dir, global_step=3, finalized_trees=0, attempted_layers=3)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 3.777295)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.353850], [0.254100], [0.106850], [0.712100], [1.012100]],
+        [pred['predictions'] for pred in predictions])
+
 
 class ModelFnTests(test_util.TensorFlowTestCase):
   """Tests bt_model_fn including unexposed internal functionalities."""
-- 
GitLab


From 421d1c077053e6e38e4c9cee99641edcd4d9ca1e Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Wed, 11 Apr 2018 18:20:19 -0700
Subject: [PATCH 051/450] In model_to_estimator, only run get_weights when
 there are initialized Keras variables(which assumes there exists a session).
 Otherwise create a session so that we can run get_config(). Actually fix
 #18193.

PiperOrigin-RevId: 192541442
---
 .../python/keras/_impl/keras/estimator.py     | 45 +++++++++-----
 .../keras/_impl/keras/estimator_test.py       | 61 ++++++++++---------
 2 files changed, 63 insertions(+), 43 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index 8043242b70..b922a6c683 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -26,7 +26,6 @@ from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import export as export_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -38,6 +37,7 @@ from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
+from tensorflow.python.ops import variables as variables_module
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import saver as saver_lib
@@ -55,6 +55,19 @@ def _cast_tensor_to_floatx(x):
     return math_ops.cast(x, K.floatx())
 
 
+def _any_variable_initalized():
+  """Check if any variable has been initialized in the Keras model.
+
+  Returns:
+    boolean, True if at least one variable has been initialized, else False.
+  """
+  variables = variables_module.global_variables()
+  for v in variables:
+    if getattr(v, '_keras_initialized', False):
+      return True
+  return False
+
+
 def _create_ordered_io(keras_model, estimator_io, is_input=True):
   """Create a list of tensors from IO dictionary based on Keras IO order.
 
@@ -396,7 +409,8 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
                                      custom_objects)
       # save to checkpoint
       with session.Session(config=estimator._session_config) as sess:
-        model.set_weights(keras_weights)
+        if keras_weights:
+          model.set_weights(keras_weights)
         # Make update ops and initialize all variables.
         if not model.train_function:
           # pylint: disable=protected-access
@@ -466,20 +480,21 @@ def model_to_estimator(keras_model=None,
   estimator = estimator_lib.Estimator(
       keras_model_fn, model_dir=model_dir, config=config)
 
-  old_session = K._SESSION
-  # Pass the config into keras backend's default session.
-  sess = session.Session(config=estimator._session_config)
-  K.set_session(sess)
-  try:
-    keras_weights = keras_model.get_weights()
-  except errors.FailedPreconditionError as e:
-    if old_session is None:
-      raise e
-    logging.warning(
-        'The Keras backend session has already been '
-        'set. The _session_config passed to model_to_estimator is not used.')
-    K.set_session(old_session)
+  # Check if we need to call get_weights:
+  if _any_variable_initalized():
     keras_weights = keras_model.get_weights()
+    # Warn if config passed to estimator tries to update GPUOptions. If a
+    # session has already been created, the GPUOptions passed to the first
+    # session sticks.
+    if estimator._session_config.HasField('gpu_options'):
+      logging.warning(
+          'The Keras backend session has already been set. '
+          'The _session_config passed to model_to_estimator will not be used.')
+  else:
+    # Pass the config into keras backend's default session.
+    sess = session.Session(config=estimator._session_config)
+    K.set_session(sess)
+    keras_weights = None
 
   if keras_model._is_graph_network:
     # TODO(yifeif): move checkpoint initialization to scaffold.init_fn
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index 27b7ec7dd4..653cdc01e2 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -27,10 +27,12 @@ import numpy as np
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.applications import mobilenet
+from tensorflow.python.keras._impl.keras.optimizers import SGD
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -443,8 +445,9 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     model = simple_functional_model()
     model.compile(
         loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
-    est_keras = keras.estimator.model_to_estimator(
-        keras_model=model, config=self._config)
+    with self.test_session():
+      est_keras = keras.estimator.model_to_estimator(
+          keras_model=model, config=self._config)
 
     with self.test_session():
       with self.assertRaises(ValueError):
@@ -497,20 +500,22 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
             model_dir=tempfile.mkdtemp(dir=self._base_dir))
 
   def test_gpu_config(self):
-    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer='rmsprop',
-        metrics=['mse', keras.metrics.categorical_accuracy])
+    with ops.Graph().as_default():
+      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer='rmsprop',
+          metrics=['mse', keras.metrics.categorical_accuracy])
 
-    gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
-    sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
-    self._config._session_config = sess_config
-    keras.estimator.model_to_estimator(
-        keras_model=keras_model, config=self._config)
-    self.assertEqual(keras.backend.get_session()
-                     ._config.gpu_options.per_process_gpu_memory_fraction,
-                     gpu_options.per_process_gpu_memory_fraction)
+      gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
+      sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
+      self._config._session_config = sess_config
+      keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
+      self.assertEqual(
+          keras.backend.get_session()
+          ._config.gpu_options.per_process_gpu_memory_fraction,
+          gpu_options.per_process_gpu_memory_fraction)
 
   def test_pretrained_weights(self):
     keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
@@ -518,19 +523,19 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         loss='categorical_crossentropy',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
         metrics=['mse', keras.metrics.categorical_accuracy])
-
-    keras_model.train_on_batch(
-        np.random.random((10,) + _INPUT_SIZE), np.random.random((10,
-                                                                 _NUM_CLASS)))
-    weights = keras_model.get_weights()
-    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
-    keras_model.set_weights(weights)
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=['mse', keras.metrics.categorical_accuracy])
-    keras.estimator.model_to_estimator(
-        keras_model=keras_model, config=self._config)
+    with self.test_session():
+      keras_model.train_on_batch(
+          np.random.random((10,) + _INPUT_SIZE),
+          np.random.random((10, _NUM_CLASS)))
+      weights = keras_model.get_weights()
+      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+      keras_model.set_weights(weights)
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer=SGD(lr=0.0001, momentum=0.9),
+          metrics=['mse', keras.metrics.categorical_accuracy])
+      keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
 
 
 if __name__ == '__main__':
-- 
GitLab


From ba25b8ba9f88df5db8c11c0bec9b27c8151af7d7 Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Mon, 16 Apr 2018 14:52:41 -0700
Subject: [PATCH 052/450] Increase softmax gpu unittest numeric stability

PiperOrigin-RevId: 193103363
---
 tensorflow/python/kernel_tests/softmax_op_test.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index 981f96b74d..dc4d4dbeab 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -39,6 +39,10 @@ class SoftmaxTest(test.TestCase):
       dim = len(features.shape) - 1
     one_only_on_dim = list(features.shape)
     one_only_on_dim[dim] = 1
+    is_fp16 = features.dtype == np.float16
+    if is_fp16:
+      # Do the compute in fp32 and cast the result back to fp16.
+      features = features.astype(np.float32)
     e = np.exp(features - np.reshape(
         np.amax(
             features, axis=dim), one_only_on_dim))
@@ -47,6 +51,8 @@ class SoftmaxTest(test.TestCase):
       res = np.log(softmax)
     else:
       res = softmax
+    if is_fp16:
+      res = res.astype(np.float16)
     return res
 
   def _testSoftmax(self, np_features, dim=-1, log=False, use_gpu=False):
@@ -125,8 +131,8 @@ class SoftmaxTest(test.TestCase):
                        "Test only applicable when running on GPUs")
   def testFloatGPU(self):
     if test.is_gpu_available(cuda_only=True):
-      rows = [2**x + np.random.randint(0, 1024) for x in range(1, 10)]
-      cols = [2**x + np.random.randint(0, 1024) for x in range(1, 10)]
+      rows = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
+      cols = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
       for row, col in zip(rows, cols):
         logging.info("Testing softmax float dtype in shape [%d, %d]", row, col)
         data = np.random.rand(row, col)
@@ -140,8 +146,8 @@ class SoftmaxTest(test.TestCase):
                        "Test only applicable when running on GPUs")
   def testHalfGPU(self):
     if test.is_gpu_available(cuda_only=True):
-      rows = [2**x + np.random.randint(0, 1024) for x in range(1, 8)]
-      cols = [2**x + np.random.randint(0, 1024) for x in range(1, 8)]
+      rows = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
+      cols = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
       for row, col in zip(rows, cols):
         logging.info("Testing softmax half dtype in shape [%d, %d]", row, col)
         data = np.random.rand(row, col)
-- 
GitLab


From 5aba07dce5b9e924183efcd05cd82f2fbb70edc8 Mon Sep 17 00:00:00 2001
From: Chris Ying <chrisying@google.com>
Date: Tue, 17 Apr 2018 15:28:12 -0700
Subject: [PATCH 053/450] Fix CheckpointSaverHook to properly save every
 save_checkpoints_steps for TPU workloads.

PiperOrigin-RevId: 193266515
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   |  9 ++
 .../training/basic_session_run_hooks.py       | 10 +-
 .../training/basic_session_run_hooks_test.py  | 93 +++++++++++++++++++
 ...sorflow.train.-checkpoint-saver-hook.pbtxt |  2 +-
 4 files changed, 111 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 7fab19afee..0948997b28 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -2054,6 +2054,14 @@ class TPUEstimator(estimator_lib.Estimator):
                   },
                   every_n_secs=30)
           ] + input_hooks
+          chief_hooks = [
+              training.CheckpointSaverHook(
+                  self.model_dir,
+                  save_secs=self._config.save_checkpoints_secs,
+                  save_steps=self._config.save_checkpoints_steps,
+                  steps_per_run=self._config.tpu_config.iterations_per_loop,
+                  scaffold=scaffold)
+          ]
           summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
           with ops.control_dependencies([loss]):
             update_ops = _sync_variables_ops()
@@ -2067,6 +2075,7 @@ class TPUEstimator(estimator_lib.Estimator):
           return model_fn_lib.EstimatorSpec(
               mode,
               loss=loss,
+              training_chief_hooks=chief_hooks,
               training_hooks=hooks,
               train_op=train_op,
               scaffold=scaffold)
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 094a9e886b..3651291bdf 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -391,7 +391,8 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
                saver=None,
                checkpoint_basename="model.ckpt",
                scaffold=None,
-               listeners=None):
+               listeners=None,
+               steps_per_run=1):
     """Initializes a `CheckpointSaverHook`.
 
     Args:
@@ -404,6 +405,9 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
       listeners: List of `CheckpointSaverListener` subclass instances.
         Used for callbacks that run immediately before or after this hook saves
         the checkpoint.
+      steps_per_run: `int`, number of steps that occur between each invocation
+        of the hook. Primarily used for TPU workloads which run multiple steps
+        in a while loop in a single Session.run.
 
     Raises:
       ValueError: One of `save_steps` or `save_secs` should be set.
@@ -419,6 +423,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     self._timer = SecondOrStepTimer(every_secs=save_secs,
                                     every_steps=save_steps)
     self._listeners = listeners or []
+    self._steps_per_run = steps_per_run
 
   def begin(self):
     self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir)
@@ -450,7 +455,8 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
 
   def after_run(self, run_context, run_values):
     stale_global_step = run_values.results
-    if self._timer.should_trigger_for_step(stale_global_step+1):
+    if self._timer.should_trigger_for_step(
+        stale_global_step + self._steps_per_run):
       # get the real value after train op.
       global_step = run_context.session.run(self._global_step_tensor)
       if self._timer.should_trigger_for_step(global_step):
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index f39a5261a9..25962f6bf7 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -719,6 +719,99 @@ class CheckpointSaverHookTest(test.TestCase):
     fake_summary_writer.FakeSummaryWriter.uninstall()
 
 
+class CheckpointSaverHookMultiStepTest(test.TestCase):
+
+  def setUp(self):
+    self.model_dir = tempfile.mkdtemp()
+    self.graph = ops.Graph()
+    self.steps_per_run = 5
+    with self.graph.as_default():
+      self.scaffold = monitored_session.Scaffold()
+      self.global_step = variables.get_or_create_global_step()
+      self.train_op = training_util._increment_global_step(self.steps_per_run)
+
+  def tearDown(self):
+    shutil.rmtree(self.model_dir, ignore_errors=True)
+
+  def test_save_steps_saves_in_first_step(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=2*self.steps_per_run,
+          scaffold=self.scaffold,
+          steps_per_run=self.steps_per_run)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        mon_sess.run(self.train_op)
+        self.assertEqual(5,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+  def test_save_steps_saves_periodically(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=2*self.steps_per_run,
+          scaffold=self.scaffold,
+          steps_per_run=self.steps_per_run)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        mon_sess.run(self.train_op)
+        # Saved (step=5)
+        self.assertEqual(5,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Not saved (step=10)
+        self.assertEqual(5,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Saved (step=15)
+        self.assertEqual(15,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Not saved (step=20)
+        self.assertEqual(15,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Saved (step=25)
+        self.assertEqual(25,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+  def test_save_steps_saves_at_end(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=2*self.steps_per_run,
+          scaffold=self.scaffold,
+          steps_per_run=self.steps_per_run)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        mon_sess.run(self.train_op)
+        mon_sess.run(self.train_op)
+        hook.end(sess)
+        self.assertEqual(10,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+
 class ResourceCheckpointSaverHookTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
index c3037baa8c..327799729c 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\', \'steps_per_run\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\', \'1\'], "
   }
   member_method {
     name: "after_create_session"
-- 
GitLab


From b5f8c3531924c56cf4866f57ce0ccea1b72b289e Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Tue, 17 Apr 2018 10:53:07 -0700
Subject: [PATCH 054/450] Enable consumption of GIT_TAG_OVERRIDE env var in
 release build script. (#18579)

Enable consumption of GIT_TAG_OVERRIDE env var in release build script.
---
 tensorflow/contrib/cmake/tf_core_framework.cmake | 2 +-
 tensorflow/tools/ci_build/builds/pip.sh          | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index a1c320347f..73cadc58ff 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -276,7 +276,7 @@ add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
 add_custom_command(OUTPUT
     ${VERSION_INFO_CC}
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    --raw_generate ${VERSION_INFO_CC}
+    ARGS --raw_generate ${VERSION_INFO_CC} --git_tag_override=${GIT_TAG_OVERRIDE}
     DEPENDS __force_rebuild)
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 82042b93c0..5fa75e1d61 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -123,6 +123,10 @@ done
 
 BAZEL_FLAGS=$(str_strip "${BAZEL_FLAGS}")
 
+if [[ -n "$GIT_TAG_OVERRIDE" ]]; then  # forward the override to Bazel actions only when it is actually set
+  BAZEL_FLAGS+=" --action_env=GIT_TAG_OVERRIDE"
+fi
+
 echo "Using Bazel flags: ${BAZEL_FLAGS}"
 
 PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package"
-- 
GitLab


From 6e9d3ad2aad1d6fc417882a7f5c7aba22b7df18e Mon Sep 17 00:00:00 2001
From: Derek Murray <derek.murray@gmail.com>
Date: Tue, 17 Apr 2018 14:59:58 -0700
Subject: [PATCH 055/450] [tf.data] Fix a device placement issue in
 `prefetch_to_device()`. (#18607)

* [tf.data] Fix a device placement issue in `prefetch_to_device()`.

Previously, the `iterator_get_device()` op was both colocated with the
iterator and placed on the prefetch target device, which is infeasible. Move the
construction of that op outside the `with device():` block to fix this.

Also enable the relevant test to run as a CUDA test.

* Import the cuda_py_test rule.
---
 tensorflow/contrib/data/python/kernel_tests/BUILD     | 7 +++----
 tensorflow/contrib/data/python/ops/prefetching_ops.py | 6 ++++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 7270d533c6..fa5662ce0b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -4,7 +4,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
@@ -473,12 +473,11 @@ py_test(
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "prefetching_ops_test",
     size = "small",
     srcs = ["prefetching_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 89c04dc89a..e4c9f8b58a 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -114,11 +114,13 @@ class _PrefetchToDeviceIterator(object):
       ret = remote_iterator.get_next()
       return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
+    iterator_device = gen_dataset_ops.iterator_get_device(
+        self._input_iterator._iterator_resource)
+
     with ops.device(device):
       self._buffering_resource = function_buffering_resource(
           f=_prefetch_fn,
-          target_device=gen_dataset_ops.iterator_get_device(
-              self._input_iterator._iterator_resource),
+          target_device=iterator_device,
           string_arg=input_iterator_handle,
           buffer_size=buffer_size,
           shared_name=shared_name)
-- 
GitLab


From 77586aefab8f5be9677659099ebe5467559c2d37 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Tue, 17 Apr 2018 16:18:07 -0700
Subject: [PATCH 056/450] Update version strings for 1.8.0rc1.

---
 tensorflow/core/public/version.h              |  2 +-
 tensorflow/docs_src/install/install_c.md      |  2 +-
 tensorflow/docs_src/install/install_go.md     |  2 +-
 tensorflow/docs_src/install/install_java.md   | 22 +++++++++----------
 tensorflow/docs_src/install/install_linux.md  | 22 +++++++++----------
 tensorflow/docs_src/install/install_mac.md    | 10 ++++-----
 .../docs_src/install/install_sources.md       |  4 ++--
 tensorflow/tools/pip_package/setup.py         |  2 +-
 8 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 0ca7d8475f..ba69efb289 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc0"
+#define TF_VERSION_SUFFIX "-rc1"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 995b8ae666..8c165aad52 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 2938a8f7ee..26cbcc9a9b 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc1.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index c87eacfa93..1b0bbdba7b 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.8.0-rc0</version>
+  <version>1.8.0-rc1</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.8.0-rc0</version>
+                 <version>1.8.0-rc1</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.8.0-rc0</version>
+  <version>1.8.0-rc1</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.8.0-rc0</version>
+  <version>1.8.0-rc1</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc1.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.8.0-rc0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.8.0-rc1.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.8.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.8.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 8387289fcf..f19f827e25 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -194,7 +194,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -299,7 +299,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -485,7 +485,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -659,14 +659,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -678,14 +678,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -697,14 +697,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -716,14 +716,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index a237d1af54..ff6c2f5e44 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -524,7 +524,7 @@ The value you specify depends on your Python version.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl
 </pre>
 
 
@@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 677e3329b6..d48a6ee550 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -350,10 +350,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.8.0rc0 on Linux:
+for TensorFlow 1.8.0rc1 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc1-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index f676f040ad..6da3223d33 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.8.0-rc0'
+_VERSION = '1.8.0-rc1'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
-- 
GitLab


From 9a6e21726e9978b9ab5442ad63dbc8037ec4a941 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Tue, 17 Apr 2018 16:24:42 -0700
Subject: [PATCH 057/450] TpuStrategy -> TPUStrategy

PiperOrigin-RevId: 193275991
---
 tensorflow/contrib/distribute/python/combinations.py | 2 +-
 tensorflow/contrib/distribute/python/tpu_strategy.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 1f66997e6e..946310aa6f 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -266,7 +266,7 @@ one_device_strategy = NamedDistribution(
     "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"),
     None)
 tpu_strategy = NamedDistribution(
-    "TPU", tpu_strategy.TpuStrategy(), required_tpu=True)
+    "TPU", tpu_strategy.TPUStrategy(), required_tpu=True)
 mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
     "MirroredCPUAndGPU",
     mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"]), 1)
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index 0ac307dd6a..804217b5ce 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -32,10 +32,10 @@ from tensorflow.python.ops import control_flow_ops
 
 
 # TODO(isaprykin):  Consider whether inheriting is really appropriate.
-class TpuStrategy(one_device_strategy.OneDeviceStrategy):
+class TPUStrategy(one_device_strategy.OneDeviceStrategy):
 
   def __init__(self, master=None, iterations=None, model_dir=None):
-    super(TpuStrategy, self).__init__('/cpu:0')
+    super(TPUStrategy, self).__init__('/cpu:0')
 
   def _call_for_each_tower(self, fn, *args, **kwargs):
     kwargs.pop('run_concurrently', None)
-- 
GitLab


From ab16333f04df819fff34714b748010149443106d Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Tue, 17 Apr 2018 17:17:04 -0700
Subject: [PATCH 058/450] Minor cleanups to the gather expander; NFC

This change is NFC now, but it makes the code more general and this generality
will be used later on.  For instance

  ExpandFirstDimIntoNDims(transposed_gather_indices, {1, shape.dimensions(0)})

does not work if shape is a scalar shape (and this is fine because today shape is
never scalar) but

  PrependDegenerateDims(transposed_gather_indices, 1)

works fine if transposed_gather_indices is scalar (and it will be, in a future
change).

PiperOrigin-RevId: 193283404
---
 tensorflow/compiler/xla/service/BUILD         |  17 ++
 .../compiler/xla/service/gather_expander.cc   |  15 +-
 .../xla/service/hlo_creation_utils.cc         |  13 +
 .../compiler/xla/service/hlo_creation_utils.h |  12 +-
 .../xla/service/hlo_creation_utils_test.cc    | 234 ++++++++++++++++++
 5 files changed, 279 insertions(+), 12 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/hlo_creation_utils_test.cc

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 9831a09c1f..9009cbf845 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1220,6 +1220,23 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "hlo_creation_utils_test",
+    srcs = ["hlo_creation_utils_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_creation_utils",
+        ":hlo_evaluator",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "batchnorm_expander",
     srcs = ["batchnorm_expander.cc"],
diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index 221ff7900f..1239f56364 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -86,8 +86,7 @@ static StatusOr<HloInstruction*> CanonicalizeGatherIndices(
   // all of the non-index-vector dimensions.
   const Shape& shape = transposed_gather_indices->shape();
   if (shape.dimensions_size() == 1) {
-    return ExpandFirstDimIntoNDims(transposed_gather_indices,
-                                   {1, shape.dimensions(0)});
+    return PrependDegenerateDims(transposed_gather_indices, 1);
   } else {
     return CollapseFirstNDims(transposed_gather_indices,
                               shape.dimensions_size() - 1);
@@ -112,11 +111,7 @@ static StatusOr<HloInstruction*> AdjustGatherDimsInAccumulator(
     // dynamic-slice.  In that case, there is a leading degenerate gather
     // dimension that we added to make this special case play well with the
     // general while loop which we need to remove now.
-    CHECK_EQ(accumulator->shape().dimensions(0), 1);
-    ArraySlice<int64> reshaped_dim_sizes =
-        AsInt64Slice(accumulator->shape().dimensions());
-    reshaped_dim_sizes.remove_prefix(1);
-    return MakeReshapeHlo(reshaped_dim_sizes, accumulator);
+    return ElideDegenerateDims(accumulator, {0});
   }
 
   return ExpandFirstDimIntoNDims(accumulator, output_gather_dim_bounds);
@@ -195,10 +190,8 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
                       MakeDynamicSliceHlo(operand, gathered_slice_start,
                                           gather.gather_window_bounds()));
 
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * gathered_slice_for_update,
-      ExpandFirstDimIntoNDims(gathered_slice,
-                              {1, gathered_slice->shape().dimensions(0)}));
+  TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice_for_update,
+                      PrependDegenerateDims(gathered_slice, 1));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * index_vector_into_accumulator,
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
index b186767ce7..9a89888480 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -163,6 +163,8 @@ StatusOr<HloInstruction*> MakeConcatHlo(ArraySlice<HloInstruction*> operands,
 }
 
 StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
+  CHECK_GT(n, 0);
+
   const Shape& operand_shape = operand->shape();
   CHECK_GE(operand_shape.dimensions_size(), n);
   int64 new_shape_leading_bound = 1;
@@ -184,6 +186,17 @@ StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
   return MakeReshapeHlo(output_shape, operand);
 }
 
+StatusOr<HloInstruction*> PrependDegenerateDims(HloInstruction* operand,
+                                                int64 n) {
+  CHECK_GT(n, 0);
+  std::vector<int64> new_shape_dims;
+  const Shape& operand_shape = operand->shape();
+  new_shape_dims.reserve(n + operand_shape.dimensions_size());
+  new_shape_dims.insert(new_shape_dims.begin(), n, 1);
+  c_copy(operand_shape.dimensions(), std::back_inserter(new_shape_dims));
+  return MakeReshapeHlo(new_shape_dims, operand);
+}
+
 StatusOr<HloInstruction*> ExpandFirstDimIntoNDims(
     HloInstruction* operand, ArraySlice<int64> expanded_dims) {
   CHECK_GT(operand->shape().dimensions_size(), 0);
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h
index d99e32a737..c9a7361a6a 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.h
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h
@@ -103,12 +103,22 @@ StatusOr<HloInstruction*> MakeConcatHlo(
 // their operand(s).
 
 // Collapses (via reshape) the first N (logical) dimensions of `operand` into a
-// single leading dimension.  `operand` must have rank > n.
+// single leading dimension.  `operand` must have rank >= `n` and `n` must not
+// be 0.
 //
 // For instance if `operand` has shape f32[7,8,9] and n is 2 then the output is
 // the `operand` reshaped to [56,9].
 StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n);
 
+// Prepends `n` degenerate dimensions (dimensions with bound = 1) to `operand`
+// using a reshape.
+//
+// For instance, with `n` = 1, an operand of shape f32[3,4,5] is reshaped to
+// f32[1,3,4,5], and an f32 scalar (i.e. shape f32[]) is reshaped to f32[1].
+// `n` must be greater than 0.
+StatusOr<HloInstruction*> PrependDegenerateDims(HloInstruction* operand,
+                                                int64 n);
+
 // Expands (via reshape) the first (logical) dimension of `operand` into a
 // sequence of `expanded_dims` dimensions.  `operand` must at least be of rank 1
 // and the number of elements in its first dimension must be equal to the
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
new file mode 100644
index 0000000000..6b681a5bf6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
@@ -0,0 +1,234 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+using tensorflow::gtl::ArraySlice;
+
+std::unique_ptr<HloModule> CreateModuleWithProgramShape(
+    PrimitiveType primitive_type, ArraySlice<int64> input_shape_dims,
+    ArraySlice<int64> output_shape_dims, HloInstruction** param,
+    HloComputation** entry_computation) {
+  Shape input_shape = ShapeUtil::MakeShape(primitive_type, input_shape_dims);
+  Shape output_shape = ShapeUtil::MakeShape(primitive_type, output_shape_dims);
+  std::unique_ptr<HloModule> module = MakeUnique<HloModule>("test");
+  *entry_computation = module->AddEntryComputation(
+      CreateComputationWithSignature({&input_shape}, output_shape, "entry")
+          .ValueOrDie());
+  *param = (*entry_computation)->parameter_instruction(0);
+  return module;
+}
+
+TEST(HloCreationUtilsTest, CollapseFirst1Dim) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_1_dims_collapsed,
+                          CollapseFirstNDims(param, 1));
+  entry_computation->set_root_instruction(first_1_dims_collapsed);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({3, 4})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR1<int32>({3, 4}));
+}
+
+TEST(HloCreationUtilsTest, CollapseFirst2Dims) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2, 3, 2}, /*output_shape_dims=*/{6, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_2_dims_collapsed,
+                          CollapseFirstNDims(param, 2));
+  entry_computation->set_root_instruction(first_2_dims_collapsed);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Literal> result_literal,
+      evaluator.Evaluate<std::unique_ptr<Literal>>(
+          *module,
+          {Literal::CreateR3<int32>(
+              {{{1, 2}, {3, 4}, {5, 6}}, {{-1, -2}, {-3, -4}, {-5, -6}}})}));
+  CHECK_EQ(*result_literal,
+           *Literal::CreateR2<int32>(
+               {{1, 2}, {3, 4}, {5, 6}, {-1, -2}, {-3, -4}, {-5, -6}}));
+}
+
+TEST(HloCreationUtilsTest, Prepend1DegenerateDim) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_1_degenerate_dim_prepended,
+                          PrependDegenerateDims(param, 1));
+  entry_computation->set_root_instruction(with_1_degenerate_dim_prepended);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({9, 10})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR2<int32>({{9, 10}}));
+}
+
+TEST(HloCreationUtilsTest, Prepend2DegenerateDims) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 1, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended,
+                          PrependDegenerateDims(param, 2));
+  entry_computation->set_root_instruction(with_2_degenerate_dims_prepended);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({9, 10})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR3<int32>({{{9, 10}}}));
+}
+
+TEST(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{}, /*output_shape_dims=*/{1, 1}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended,
+                          PrependDegenerateDims(param, 2));
+  entry_computation->set_root_instruction(with_2_degenerate_dims_prepended);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR0<int32>(9)}));
+  CHECK_EQ(*result_literal, *Literal::CreateR2<int32>({{9}}));
+}
+
+TEST(HloCreationUtilsTest, ExpandFirstDimInto3Dims) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{6}, /*output_shape_dims=*/{3, 1, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_dim_expanded,
+                          ExpandFirstDimIntoNDims(param, {3, 1, 2}));
+  entry_computation->set_root_instruction(first_dim_expanded);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Literal> result_literal,
+      evaluator.Evaluate<std::unique_ptr<Literal>>(
+          *module, {Literal::CreateR1<int32>({1, 2, 3, 4, 5, 6})}));
+  CHECK_EQ(*result_literal,
+           *Literal::CreateR3<int32>({{{1, 2}}, {{3, 4}}, {{5, 6}}}));
+}
+
+TEST(HloCreationUtilsTest, PadVectorWithZeros) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{6}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      HloInstruction * zero_padded_param,
+      PadVectorWithZeros(param, /*zeros_to_prepend=*/3, /*zeros_to_append=*/1));
+  entry_computation->set_root_instruction(zero_padded_param);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({3, 4})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR1<int32>({0, 0, 0, 3, 4, 0}));
+}
+
+TEST(HloCreationUtilsTest, BroadcastZeros_S32) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{}, /*output_shape_dims=*/{2, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      HloInstruction * zeros,
+      BroadcastZeros(module->entry_computation(), S32, {2, 2}));
+  entry_computation->set_root_instruction(zeros);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR0<int32>(0)}));
+  CHECK_EQ(*result_literal, *Literal::CreateR2<int32>({{0, 0}, {0, 0}}));
+}
+
+TEST(HloCreationUtilsTest, BroadcastZeros_F32) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      F32,
+      /*input_shape_dims=*/{}, /*output_shape_dims=*/{2, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      HloInstruction * zeros,
+      BroadcastZeros(module->entry_computation(), F32, {2, 2}));
+  entry_computation->set_root_instruction(zeros);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR0<float>(0.0f)}));
+  CHECK_EQ(*result_literal,
+           *Literal::CreateR2<float>({{0.0f, 0.0f}, {0.0f, 0.0f}}));
+}
+
+}  // namespace
+}  // namespace xla
-- 
GitLab


From 235a9e32c58c68f2ccf1552c1c2842b42f5c2cf0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 18:48:38 -0700
Subject: [PATCH 059/450] Comment out part of ring_reducer_test suspected to
 have a race.

PiperOrigin-RevId: 193292788
---
 tensorflow/core/common_runtime/ring_reducer_test.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index e4387a074a..57c36d6582 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -572,9 +572,9 @@ DEF_TEST(INT32, CPU, 2, 8, 3, 4095, 0)
 DEF_TEST(INT64, CPU, 1, 2, 1, 1001, 0)
 DEF_TEST(INT64, CPU, 2, 8, 3, 4095, 0)
 
-// Failure tests
-DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
-DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
+// // Failure tests
+// DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
+// DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
 #endif
 
 #ifdef GOOGLE_CUDA
@@ -597,9 +597,9 @@ DEF_TEST(DOUBLE, GPU, 1, 2, 1, 1001, 0)
 // DEF_TEST(INT32, GPU, 1, 2, 1, 1001, 0)
 DEF_TEST(INT64, GPU, 1, 2, 1, 1001, 0)
 
-// Failure tests
-DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
-DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5)
+// // Failure tests
+// DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
+// DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5)
 #endif
 
 }  // namespace
-- 
GitLab


From 41e2cd187b31e9e6d88bc042e21e73f7be0ed729 Mon Sep 17 00:00:00 2001
From: Chris Ying <chrisying@google.com>
Date: Tue, 17 Apr 2018 20:31:30 -0700
Subject: [PATCH 060/450] Disable CheckpointSaverHook when both
 save_checkpoints_secs and save_checkpoints_steps are None

PiperOrigin-RevId: 193299688
---
 .../contrib/tpu/python/tpu/tpu_estimator.py    | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 0948997b28..98eb0e240f 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -2054,14 +2054,16 @@ class TPUEstimator(estimator_lib.Estimator):
                   },
                   every_n_secs=30)
           ] + input_hooks
-          chief_hooks = [
-              training.CheckpointSaverHook(
-                  self.model_dir,
-                  save_secs=self._config.save_checkpoints_secs,
-                  save_steps=self._config.save_checkpoints_steps,
-                  steps_per_run=self._config.tpu_config.iterations_per_loop,
-                  scaffold=scaffold)
-          ]
+          chief_hooks = []
+          if (self._config.save_checkpoints_secs or
+              self._config.save_checkpoints_steps):
+            chief_hooks.append(
+                training.CheckpointSaverHook(
+                    self.model_dir,
+                    save_secs=self._config.save_checkpoints_secs,
+                    save_steps=self._config.save_checkpoints_steps,
+                    steps_per_run=self._config.tpu_config.iterations_per_loop,
+                    scaffold=scaffold))
           summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
           with ops.control_dependencies([loss]):
             update_ops = _sync_variables_ops()
-- 
GitLab


From d77a621a571d8ab0d69f2682586674e6dff4ec4e Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 17 Apr 2018 21:04:35 -0700
Subject: [PATCH 061/450] [XLA] Convert XLA to use xla::se as a namespace alias
 for ::stream_executor.

PiperOrigin-RevId: 193301997
---
 tensorflow/compiler/xla/BUILD                 |  3 +
 .../compiler/xla/client/client_library.cc     | 18 ++---
 .../compiler/xla/client/client_library.h      | 20 +++---
 .../compiler/xla/client/local_client.cc       |  2 -
 tensorflow/compiler/xla/client/local_client.h |  2 +-
 tensorflow/compiler/xla/device_util.h         |  2 +-
 .../compiler/xla/executable_run_options.h     |  7 +-
 tensorflow/compiler/xla/rpc/grpc_service.cc   |  2 +-
 tensorflow/compiler/xla/rpc/grpc_service.h    |  2 +-
 .../xla/service/allocation_tracker.cc         |  6 +-
 .../compiler/xla/service/allocation_tracker.h |  8 +--
 tensorflow/compiler/xla/service/backend.cc    | 19 ++---
 tensorflow/compiler/xla/service/backend.h     | 34 ++++-----
 .../xla/service/compile_only_service.cc       |  4 +-
 .../xla/service/compile_only_service.h        |  2 +-
 tensorflow/compiler/xla/service/compiler.cc   | 13 ++--
 tensorflow/compiler/xla/service/compiler.h    | 25 +++----
 .../xla/service/computation_placer.cc         | 16 ++---
 .../compiler/xla/service/computation_placer.h |  9 ++-
 .../compiler/xla/service/cpu/cpu_compiler.cc  | 14 ++--
 .../compiler/xla/service/cpu/cpu_compiler.h   | 12 ++--
 .../xla/service/cpu/cpu_executable.cc         |  9 +--
 .../compiler/xla/service/cpu/cpu_executable.h | 12 ++--
 .../xla/service/cpu/cpu_transfer_manager.cc   | 13 ++--
 .../xla/service/cpu/cpu_transfer_manager.h    | 25 ++++---
 .../service/cpu/parallel_cpu_executable.cc    |  4 +-
 .../xla/service/cpu/parallel_cpu_executable.h |  9 ++-
 .../xla/service/device_memory_allocator.cc    | 25 +++----
 .../xla/service/device_memory_allocator.h     | 28 ++++----
 tensorflow/compiler/xla/service/executable.cc |  6 +-
 tensorflow/compiler/xla/service/executable.h  |  2 +-
 .../xla/service/generic_transfer_manager.cc   | 13 ++--
 .../xla/service/generic_transfer_manager.h    | 35 ++++------
 .../xla/service/gpu/buffer_allocations.cc     |  2 -
 .../xla/service/gpu/buffer_allocations.h      | 21 +++---
 .../xla/service/gpu/conditional_thunk.cc      |  5 +-
 .../xla/service/gpu/conditional_thunk.h       |  2 +-
 .../xla/service/gpu/convolution_thunk.cc      |  2 -
 .../xla/service/gpu/convolution_thunk.h       | 24 +++----
 .../compiler/xla/service/gpu/copy_thunk.cc    | 12 ++--
 .../compiler/xla/service/gpu/copy_thunk.h     |  6 +-
 .../xla/service/gpu/cudnn_batchnorm_thunk.cc  |  1 -
 .../xla/service/gpu/cudnn_batchnorm_thunk.h   |  6 +-
 .../gpu/cudnn_convolution_algorithm_picker.cc |  2 -
 .../gpu/cudnn_convolution_algorithm_picker.h  |  7 +-
 .../service/gpu/cudnn_convolution_runner.cc   | 28 +++-----
 .../service/gpu/cudnn_convolution_runner.h    | 26 +++----
 .../compiler/xla/service/gpu/fft_thunk.cc     |  2 -
 .../compiler/xla/service/gpu/fft_thunk.h      | 17 +++--
 .../compiler/xla/service/gpu/for_thunk.cc     |  3 +-
 .../compiler/xla/service/gpu/for_thunk.h      |  3 +-
 .../compiler/xla/service/gpu/gemm_thunk.cc    |  2 -
 .../compiler/xla/service/gpu/gemm_thunk.h     |  9 +--
 .../compiler/xla/service/gpu/gpu_compiler.cc  |  8 +--
 .../compiler/xla/service/gpu/gpu_compiler.h   | 10 ++-
 .../xla/service/gpu/gpu_executable.cc         |  4 +-
 .../xla/service/gpu/gpu_transfer_manager.cc   |  6 +-
 .../xla/service/gpu/gpu_transfer_manager.h    | 11 ++-
 .../xla/service/gpu/infeed_manager.cc         |  2 -
 .../compiler/xla/service/gpu/infeed_manager.h | 17 ++---
 .../compiler/xla/service/gpu/infeed_thunk.cc  |  6 +-
 .../compiler/xla/service/gpu/infeed_thunk.h   |  2 +-
 .../xla/service/gpu/ir_emitter_context.h      |  6 +-
 .../compiler/xla/service/gpu/kernel_thunk.cc  |  2 -
 .../compiler/xla/service/gpu/kernel_thunk.h   | 11 ++-
 .../compiler/xla/service/gpu/memset_thunk.cc  |  2 -
 .../compiler/xla/service/gpu/memset_thunk.h   |  4 +-
 .../xla/service/gpu/partition_assignment.cc   |  2 -
 .../xla/service/gpu/partition_assignment.h    |  3 +-
 .../xla/service/gpu/sequential_thunk.cc       |  3 +-
 .../xla/service/gpu/sequential_thunk.h        |  3 +-
 tensorflow/compiler/xla/service/gpu/thunk.h   |  6 +-
 .../compiler/xla/service/gpu/tuple_thunk.cc   |  2 -
 .../compiler/xla/service/gpu/tuple_thunk.h    |  3 +-
 .../compiler/xla/service/gpu/while_thunk.cc   |  4 +-
 .../compiler/xla/service/gpu/while_thunk.h    |  2 +-
 .../xla/service/hlo_execution_profile.h       |  2 +-
 tensorflow/compiler/xla/service/hlo_runner.cc |  2 -
 tensorflow/compiler/xla/service/hlo_runner.h  |  5 +-
 .../xla/service/interpreter/compiler.cc       | 14 ++--
 .../xla/service/interpreter/compiler.h        | 11 ++-
 .../xla/service/interpreter/executable.cc     |  2 -
 .../xla/service/interpreter/executor.cc       |  6 +-
 .../xla/service/interpreter/executor.h        |  6 +-
 .../interpreter_transfer_manager.cc           |  7 +-
 .../xla/service/interpreter/platform.cc       | 18 ++---
 .../xla/service/interpreter/platform.h        |  6 +-
 .../xla/service/interpreter/platform_id.cc    |  6 +-
 .../xla/service/interpreter/platform_id.h     |  6 +-
 .../compiler/xla/service/llvm_compiler.cc     |  2 +-
 .../compiler/xla/service/llvm_compiler.h      |  7 +-
 .../compiler/xla/service/local_service.cc     |  4 +-
 .../compiler/xla/service/platform_util.cc     |  2 -
 .../compiler/xla/service/platform_util.h      | 16 ++---
 tensorflow/compiler/xla/service/service.cc    | 45 +++++-------
 tensorflow/compiler/xla/service/service.h     | 27 ++++----
 .../service/service_executable_run_options.h  |  7 +-
 .../compiler/xla/service/shaped_buffer.cc     |  4 +-
 .../compiler/xla/service/shaped_buffer.h      | 25 +++----
 .../compiler/xla/service/transfer_manager.cc  | 19 ++---
 .../compiler/xla/service/transfer_manager.h   | 69 +++++++++----------
 .../xla/tests/bitcast_convert_test.cc         |  2 +-
 .../xla/tests/client_library_test_base.cc     |  5 +-
 .../xla/tests/client_library_test_base.h      |  5 +-
 .../xla/tests/compute_constant_test.cc        |  8 +--
 tensorflow/compiler/xla/tests/convert_test.cc |  2 +-
 .../compiler/xla/tests/dynamic_ops_test.cc    |  2 -
 tensorflow/compiler/xla/tests/fusion_test.cc  |  2 -
 .../compiler/xla/tests/hlo_test_base.cc       |  2 -
 tensorflow/compiler/xla/tests/hlo_test_base.h |  3 +-
 .../compiler/xla/tests/llvm_compiler_test.cc  |  4 +-
 .../xla/tests/local_client_execute_test.cc    |  2 -
 .../xla/tests/local_client_test_base.cc       | 14 ++--
 .../xla/tests/local_client_test_base.h        | 18 +++--
 tensorflow/compiler/xla/tests/map_test.cc     |  2 +-
 tensorflow/compiler/xla/tests/test_utils.cc   |  4 +-
 tensorflow/compiler/xla/tests/test_utils.h    |  3 +-
 .../xla/tests/vector_ops_simple_test.cc       |  2 +-
 tensorflow/compiler/xla/tests/while_test.cc   |  2 -
 .../xla/tests/xla_hlo_profile_test.cc         |  2 +-
 tensorflow/compiler/xla/types.h               |  4 +-
 121 files changed, 443 insertions(+), 663 deletions(-)

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 751777222f..88f37433a5 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -443,6 +443,9 @@ cc_library(
     srcs = ["executable_run_options.cc"],
     hdrs = ["executable_run_options.h"],
     visibility = ["//visibility:public"],
+    deps = [
+        ":types",
+    ],
 )
 
 cc_library(
diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc
index b1663bc815..803a9e4009 100644
--- a/tensorflow/compiler/xla/client/client_library.cc
+++ b/tensorflow/compiler/xla/client/client_library.cc
@@ -23,22 +23,19 @@ limitations under the License.
 
 namespace xla {
 
-LocalClientOptions::LocalClientOptions(perftools::gputools::Platform* platform,
+LocalClientOptions::LocalClientOptions(se::Platform* platform,
                                        int number_of_replicas,
                                        int intra_op_parallelism_threads)
     : platform_(platform),
       number_of_replicas_(number_of_replicas),
       intra_op_parallelism_threads_(intra_op_parallelism_threads) {}
 
-LocalClientOptions& LocalClientOptions::set_platform(
-    perftools::gputools::Platform* platform) {
+LocalClientOptions& LocalClientOptions::set_platform(se::Platform* platform) {
   platform_ = platform;
   return *this;
 }
 
-perftools::gputools::Platform* LocalClientOptions::platform() const {
-  return platform_;
-}
+se::Platform* LocalClientOptions::platform() const { return platform_; }
 
 LocalClientOptions& LocalClientOptions::set_number_of_replicas(
     int number_of_replicas) {
@@ -69,7 +66,7 @@ ClientLibrary::ClientLibrary() = default;
 ClientLibrary::~ClientLibrary() = default;
 
 /* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   LocalClientOptions default_options;
   default_options.set_platform(platform);
   return GetOrCreateLocalClient(default_options);
@@ -77,7 +74,7 @@ ClientLibrary::~ClientLibrary() = default;
 
 /* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
     const LocalClientOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   int replica_count = options.number_of_replicas();
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
@@ -115,7 +112,7 @@ ClientLibrary::~ClientLibrary() = default;
 }
 
 /* static */ LocalService* ClientLibrary::GetXlaService(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
   auto it = client_library.local_instances_.find(platform->id());
@@ -124,8 +121,7 @@ ClientLibrary::~ClientLibrary() = default;
 }
 
 /* static */ StatusOr<CompileOnlyClient*>
-ClientLibrary::GetOrCreateCompileOnlyClient(
-    perftools::gputools::Platform* platform) {
+ClientLibrary::GetOrCreateCompileOnlyClient(se::Platform* platform) {
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
 
diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h
index a6f30d82e4..3ad558fa53 100644
--- a/tensorflow/compiler/xla/client/client_library.h
+++ b/tensorflow/compiler/xla/client/client_library.h
@@ -43,13 +43,13 @@ namespace xla {
 // Options to configure the local client when it is created.
 class LocalClientOptions {
  public:
-  LocalClientOptions(perftools::gputools::Platform* platform = nullptr,
+  LocalClientOptions(se::Platform* platform = nullptr,
                      int number_of_replicas = 1,
                      int intra_op_parallelism_threads = -1);
 
   // Set the platform backing the service, or nullptr for the default platform.
-  LocalClientOptions& set_platform(perftools::gputools::Platform* platform);
-  perftools::gputools::Platform* platform() const;
+  LocalClientOptions& set_platform(se::Platform* platform);
+  se::Platform* platform() const;
 
   // Set the number of replicas to use when compiling replicated
   // programs.
@@ -61,7 +61,7 @@ class LocalClientOptions {
   int intra_op_parallelism_threads() const;
 
  private:
-  perftools::gputools::Platform* platform_;
+  se::Platform* platform_;
   int number_of_replicas_;
   int intra_op_parallelism_threads_;
 };
@@ -74,7 +74,7 @@ class ClientLibrary {
   //   platform : The platform the underlying XLA service should target. If
   //     null then default platform is used.
   static StatusOr<LocalClient*> GetOrCreateLocalClient(
-      perftools::gputools::Platform* platform = nullptr);
+      se::Platform* platform = nullptr);
   static StatusOr<LocalClient*> GetOrCreateLocalClient(
       const LocalClientOptions& options);
 
@@ -84,14 +84,14 @@ class ClientLibrary {
 
   // Returns the service from the service thread. Only used in unit tests to
   // access user computations from client.
-  static LocalService* GetXlaService(perftools::gputools::Platform* platform);
+  static LocalService* GetXlaService(se::Platform* platform);
 
   // Singleton constructor-or-accessor for compile-only clients. Arguments:
   //
   //   platform : The platform the underlying XLA service should target. If
   //     null then default platform is used.
   static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
-      perftools::gputools::Platform* platform = nullptr);
+      se::Platform* platform = nullptr);
 
   // Clears the local instance and compile only instance caches. The client
   // pointers returned by the previous GetOrCreateLocalClient() or
@@ -120,12 +120,10 @@ class ClientLibrary {
   };
 
   tensorflow::mutex service_mutex_;  // Guards the singleton creation state.
-  std::unordered_map<perftools::gputools::Platform::Id,
-                     std::unique_ptr<LocalInstance>>
+  std::unordered_map<se::Platform::Id, std::unique_ptr<LocalInstance>>
       local_instances_ GUARDED_BY(service_mutex_);
 
-  std::unordered_map<perftools::gputools::Platform::Id,
-                     std::unique_ptr<CompileOnlyInstance>>
+  std::unordered_map<se::Platform::Id, std::unique_ptr<CompileOnlyInstance>>
       compile_only_instances_ GUARDED_BY(service_mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 30594243dc..d951c44cb9 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -24,8 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
-namespace se = ::perftools::gputools;
-
 using xla::source_map_util::InvalidParameterArgument;
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 98ee7c62c9..42812b936f 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -167,7 +167,7 @@ class LocalClient : public Client {
   StatusOr<int> ReplicaNumberToDeviceOrdinal(int replica_number);
 
   // Returns the platform that the underlying service targets.
-  perftools::gputools::Platform* platform() const;
+  se::Platform* platform() const;
 
   // Returns the number of devices on the system of the service platform
   // type. Not all devices may be supported by the service (see
diff --git a/tensorflow/compiler/xla/device_util.h b/tensorflow/compiler/xla/device_util.h
index 23a622b1ad..1a51fdee68 100644
--- a/tensorflow/compiler/xla/device_util.h
+++ b/tensorflow/compiler/xla/device_util.h
@@ -29,7 +29,7 @@ namespace xla {
 
 // Returns a string that represents the device in terms of platform and ordinal;
 // e.g. the first CUDA device will be "cuda:0"
-string DeviceIdentifier(perftools::gputools::StreamExecutor* stream_exec) {
+string DeviceIdentifier(se::StreamExecutor* stream_exec) {
   return tensorflow::strings::StrCat(stream_exec->platform()->Name(), ":",
                                      stream_exec->device_ordinal());
 }
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index 1a095a82cc..a306ae16ba 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,6 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
+// Pulls in the ::stream_executor -> ::xla::se namespace alias.
+#include "tensorflow/compiler/xla/types.h"
+
 // These classes are forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@@ -37,10 +40,6 @@ struct ThreadPoolDevice;
 
 namespace xla {
 
-// TODO(b/77980417): Once the perftools::gputools -> stream_executor migration
-// is complete, add "using namespace se = stream_executor" here and
-// s/stream_executor/se::/ to match our idiom elsewhere.
-
 class DeviceMemoryAllocator;
 class DeviceAssignment;
 class ExecutionProfile;
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
index 414829d6e7..0b100bd108 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -20,7 +20,7 @@ limitations under the License.
 namespace xla {
 
 /* static */ StatusOr<std::unique_ptr<GRPCService>> GRPCService::NewService(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   std::unique_ptr<GRPCService> grpc_service(new GRPCService());
   TF_ASSIGN_OR_RETURN(grpc_service->service_,
                       ::xla::Service::NewService(platform));
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
index 7c9e484517..fad74375bd 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.h
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -29,7 +29,7 @@ class GRPCService : public grpc::XlaService::Service {
   // that the service should target. If platform is null then the default
   // platform is used.
   static StatusOr<std::unique_ptr<GRPCService>> NewService(
-      perftools::gputools::Platform* platform = nullptr);
+      se::Platform* platform = nullptr);
 
   ::grpc::Status Computation(::grpc::ServerContext* context,
                              const ComputationRequest* arg,
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 4f819a743c..359582a78c 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -204,7 +204,7 @@ StatusOr<std::vector<const ShapedBuffer*>> AllocationTracker::ResolveInternal(
 }
 
 void AllocationTracker::AddAllocationOrIncrementRefCount(
-    perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
+    se::DeviceMemoryBase device_memory, int device_ordinal) {
   AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
   auto it = allocation_map.find(device_memory.opaque());
   if (it == allocation_map.end()) {
@@ -215,8 +215,8 @@ void AllocationTracker::AddAllocationOrIncrementRefCount(
   }
 }
 
-Status AllocationTracker::DecrementRefCount(
-    perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
+Status AllocationTracker::DecrementRefCount(se::DeviceMemoryBase device_memory,
+                                            int device_ordinal) {
   AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
   auto it = allocation_map.find(device_memory.opaque());
   TF_RET_CHECK(it != allocation_map.end());
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index 038aee8541..60e93358ef 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -77,7 +77,7 @@ class AllocationTracker {
   // Data structure encapsulating single memory allocation on the device.
   struct Allocation {
     // The pointer to this allocation.
-    perftools::gputools::DeviceMemoryBase device_memory;
+    se::DeviceMemoryBase device_memory;
 
     // The device that the memory is allocated on.
     int device_ordinal;
@@ -103,13 +103,13 @@ class AllocationTracker {
 
   // Adds the given device address to the allocation tracker, or if it already
   // exists, then increment it's reference count.
-  void AddAllocationOrIncrementRefCount(
-      perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal)
+  void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory,
+                                        int device_ordinal)
       EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Decrements the reference count of the given device memory. Then, if it is
   // zero, deallocate the memory.
-  Status DecrementRefCount(perftools::gputools::DeviceMemoryBase device_memory,
+  Status DecrementRefCount(se::DeviceMemoryBase device_memory,
                            int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // A map from device memory opaque value to allocation. One such map is
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 05f2d06278..a582dbffd6 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -36,19 +36,14 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
-BackendOptions& BackendOptions::set_platform(
-    perftools::gputools::Platform* platform) {
+BackendOptions& BackendOptions::set_platform(se::Platform* platform) {
   platform_ = platform;
   return *this;
 }
 
-perftools::gputools::Platform* BackendOptions::platform() const {
-  return platform_;
-}
+se::Platform* BackendOptions::platform() const { return platform_; }
 
 BackendOptions& BackendOptions::set_intra_op_parallelism_threads(
     int num_threads) {
@@ -77,7 +72,7 @@ struct Backend::EigenThreadPoolWrapper {
 
 /* static */ StatusOr<std::unique_ptr<Backend>> Backend::CreateBackend(
     const BackendOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
   TF_ASSIGN_OR_RETURN(auto stream_executors,
                       PlatformUtil::GetStreamExecutors(platform));
@@ -121,7 +116,7 @@ StatusOr<Backend::StreamPtr> Backend::BorrowStream(
 }
 
 Backend::Backend(
-    perftools::gputools::Platform* platform, Compiler* compiler,
+    se::Platform* platform, Compiler* compiler,
     tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
     TransferManager* transfer_manager, ComputationPlacer* computation_placer,
     int intra_op_parallelism_threads)
@@ -178,7 +173,7 @@ tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
   return intra_op_thread_pool_wrapper_->pool.get();
 }
 
-StatusOr<perftools::gputools::StreamExecutor*> Backend::stream_executor(
+StatusOr<se::StreamExecutor*> Backend::stream_executor(
     int device_ordinal) const {
   if (device_ordinal < 0 ||
       device_ordinal > stream_executors_.back()->device_ordinal()) {
@@ -201,9 +196,9 @@ StatusOr<bool> Backend::devices_equivalent(int device_ordinal_a,
   // bit crude but works for GPUs which is the important case where we compile
   // an executable for one GPU and want to know if it will run (well) on
   // another.
-  TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_a,
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_a,
                       stream_executor(device_ordinal_a));
-  TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_b,
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_b,
                       stream_executor(device_ordinal_b));
   return (executor_a->GetDeviceDescription().name() ==
           executor_b->GetDeviceDescription().name());
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index b5ca483b72..d32a0a400d 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -44,8 +44,8 @@ namespace xla {
 class BackendOptions {
  public:
   // Set the platform backing the backend, or nullptr for the default platform.
-  BackendOptions& set_platform(perftools::gputools::Platform* platform);
-  perftools::gputools::Platform* platform() const;
+  BackendOptions& set_platform(se::Platform* platform);
+  se::Platform* platform() const;
 
   // Sets the thread pool size for parallel execution of an individual operator.
   // The default value of -1 will result in initializing the thread pool with
@@ -54,7 +54,7 @@ class BackendOptions {
   int intra_op_parallelism_threads() const;
 
  private:
-  perftools::gputools::Platform* platform_ = nullptr;
+  se::Platform* platform_ = nullptr;
   int intra_op_parallelism_threads_ = -1;
 };
 
@@ -66,7 +66,7 @@ class BackendOptions {
 //    StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie();
 class Backend {
  public:
-  using StreamPtr = Pool<perftools::gputools::Stream>::SmartPtr;
+  using StreamPtr = Pool<se::Stream>::SmartPtr;
 
   // Creates a new backend.
   static StatusOr<std::unique_ptr<Backend>> CreateBackend(
@@ -79,7 +79,7 @@ class Backend {
   ~Backend();
 
   // Accessors for the various objects.
-  perftools::gputools::Platform* platform() const { return platform_; }
+  se::Platform* platform() const { return platform_; }
   Compiler* compiler() const { return compiler_; }
   DeviceMemoryAllocator* memory_allocator() const {
     return memory_allocator_.get();
@@ -96,19 +96,17 @@ class Backend {
 
   // Returns stream executors of all supported devices for this backend. The
   // executors are ordered by the device ordinal.
-  const std::vector<perftools::gputools::StreamExecutor*>& stream_executors()
-      const {
+  const std::vector<se::StreamExecutor*>& stream_executors() const {
     return stream_executors_;
   }
 
   // Returns the stream executor for the given device ordinal.
-  StatusOr<perftools::gputools::StreamExecutor*> stream_executor(
-      int device_ordinal) const;
+  StatusOr<se::StreamExecutor*> stream_executor(int device_ordinal) const;
 
   // Returns the stream executor for the default device ordinal. This stream
   // executor can only be used when the number of computations is 1 (replication
   // can be > 1).
-  perftools::gputools::StreamExecutor* default_stream_executor() const {
+  se::StreamExecutor* default_stream_executor() const {
     CHECK(!stream_executors_.empty());
     return stream_executors_[0];
   }
@@ -117,8 +115,7 @@ class Backend {
   // internal pool, or by constructing/initializating it, and returns the result
   // to the caller.
   StatusOr<StreamPtr> BorrowStream(int device_ordinal);
-  StatusOr<StreamPtr> BorrowStream(
-      perftools::gputools::StreamExecutor* executor);
+  StatusOr<StreamPtr> BorrowStream(se::StreamExecutor* executor);
 
   // Returns a function to borrow a stream, as `BorrowStream` above does.
   // Purely for convenience, the caller could rather make this anonymous
@@ -157,29 +154,26 @@ class Backend {
 
  private:
   struct EigenThreadPoolWrapper;
-  Backend(perftools::gputools::Platform* platform, Compiler* compiler,
-          tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-              stream_executors,
+  Backend(se::Platform* platform, Compiler* compiler,
+          tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
           TransferManager* transfer_manager,
           ComputationPlacer* computation_placer,
           int intra_op_parallelism_threads);
   Backend(const Backend&) = delete;
   Backend& operator=(const Backend&) = delete;
 
-  perftools::gputools::Platform* platform_;
+  se::Platform* platform_;
   Compiler* compiler_;
   TransferManager* transfer_manager_;
   ComputationPlacer* computation_placer_;
 
   // Vector of stream executors. stream_executors_[0] is the default executor.
-  std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
+  std::vector<se::StreamExecutor*> stream_executors_;
 
   tensorflow::mutex mu_;
 
   // Mapping from stream executor to stream pools, used by `BorrowStream` above.
-  std::map<perftools::gputools::StreamExecutor*,
-           Pool<perftools::gputools::Stream>>
-      stream_pools_ GUARDED_BY(mu_);
+  std::map<se::StreamExecutor*, Pool<se::Stream>> stream_pools_ GUARDED_BY(mu_);
 
   // The default memory allocator to use.
   std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index fb70ea5315..c9f78a0f9f 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -37,7 +37,7 @@ limitations under the License.
 namespace xla {
 
 /* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
-CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
+CompileOnlyService::NewService(se::Platform* platform) {
   ServiceOptions default_options;
   default_options.set_platform(platform);
   return NewService(default_options);
@@ -45,7 +45,7 @@ CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
 
 /* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
 CompileOnlyService::NewService(const ServiceOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   if (platform == nullptr) {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
index dd8de42a0f..c10609e67f 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.h
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -34,7 +34,7 @@ class CompileOnlyService : public Service {
   // platform that the service should target. If platform is null then the
   // default platform is used.
   static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
-      perftools::gputools::Platform* platform);
+      se::Platform* platform);
   static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
       const ServiceOptions& options);
 
diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc
index 0392d4af48..8b01a6c4b5 100644
--- a/tensorflow/compiler/xla/service/compiler.cc
+++ b/tensorflow/compiler/xla/service/compiler.cc
@@ -23,26 +23,21 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 /* static */ tensorflow::mutex Compiler::platform_compiler_mutex_(
     tensorflow::LINKER_INITIALIZED);
 
-/* static */ std::map<perftools::gputools::Platform::Id,
-                      Compiler::CompilerFactory>*
+/* static */ std::map<se::Platform::Id, Compiler::CompilerFactory>*
 Compiler::GetPlatformCompilerFactories() {
-  static auto* r =
-      new std::map<perftools::gputools::Platform::Id, CompilerFactory>;
+  static auto* r = new std::map<se::Platform::Id, CompilerFactory>;
   return r;
 }
 
 /* static */
-std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
+std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
 Compiler::GetPlatformCompilers() {
-  static auto* r = new std::map<perftools::gputools::Platform::Id,
-                                std::unique_ptr<Compiler>>;
+  static auto* r = new std::map<se::Platform::Id, std::unique_ptr<Compiler>>;
   return r;
 }
 
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index b4b53ae2ed..5c14591d93 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -70,7 +70,7 @@ class AotCompilationOptions {
   virtual ~AotCompilationOptions() = default;
 
   // Returns the ID of the platform to which these options apply.
-  virtual perftools::gputools::Platform::Id PlatformId() const = 0;
+  virtual se::Platform::Id PlatformId() const = 0;
 
   // Optional allocator that may be used for allocating temp space on the device
   // during compilation.
@@ -109,7 +109,7 @@ class Compiler {
   virtual ~Compiler() {}
 
   // Returns the ID of the platform that this compiler targets.
-  virtual perftools::gputools::Platform::Id PlatformId() const = 0;
+  virtual se::Platform::Id PlatformId() const = 0;
 
   // Runs Hlo passes to optimize the given Hlo module, returns the optimized
   // module.
@@ -120,8 +120,7 @@ class Compiler {
   // algorithm over those buffers, to see which variant is fastest.  Any space
   // allocated should be deallocated before this function returns.
   virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for execution on a device given by the executor,
@@ -137,8 +136,7 @@ class Compiler {
   //
   // Use the overload below to compile computations that run in parallel.
   virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
@@ -151,8 +149,7 @@ class Compiler {
   // modules to RunHloPasses and RunBackends.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec,
+      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
       DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for ahead-of-time execution.  This is intended for
@@ -171,14 +168,12 @@ class Compiler {
   // be a singleton, so no ownership is transferred.
   //
   // Precondition: a platform kind must not be registered more than once.
-  static void RegisterCompilerFactory(
-      perftools::gputools::Platform::Id platform_id,
-      CompilerFactory compiler_factory);
+  static void RegisterCompilerFactory(se::Platform::Id platform_id,
+                                      CompilerFactory compiler_factory);
 
   // Returns the compiler singleton pointer if it is available for the given
   // platform, or an error status if it is not.
-  static StatusOr<Compiler*> GetForPlatform(
-      const perftools::gputools::Platform* platform);
+  static StatusOr<Compiler*> GetForPlatform(const se::Platform* platform);
 
   // Returns a function that computes the size in bytes of the logical
   // buffer that contains a shape.
@@ -198,12 +193,12 @@ class Compiler {
   static tensorflow::mutex platform_compiler_mutex_;
 
   // Map from platform kind to compiler factory.
-  static std::map<perftools::gputools::Platform::Id, CompilerFactory>*
+  static std::map<se::Platform::Id, CompilerFactory>*
   GetPlatformCompilerFactories();
 
   // Map from platform kind to compiler instance, if we made one already (based
   // on the factories above).
-  static std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
+  static std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
   GetPlatformCompilers();
 };
 
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc
index 657fba6b62..7c1bacff92 100644
--- a/tensorflow/compiler/xla/service/computation_placer.cc
+++ b/tensorflow/compiler/xla/service/computation_placer.cc
@@ -32,8 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
@@ -132,11 +130,9 @@ StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
     ComputationPlacer::platform_computation_placer_mutex_(
         tensorflow::LINKER_INITIALIZED);
 
-/* static */ std::map<perftools::gputools::Platform::Id,
-                      ComputationPlacer::State>*
+/* static */ std::map<se::Platform::Id, ComputationPlacer::State>*
 ComputationPlacer::GetPlatformComputationPlacers() {
-  static auto* r =
-      new std::map<perftools::gputools::Platform::Id, ComputationPlacer::State>;
+  static auto* r = new std::map<se::Platform::Id, ComputationPlacer::State>;
   return r;
 }
 
@@ -147,10 +143,10 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
 }
 
 static bool InitModule() {
-  xla::ComputationPlacer::RegisterComputationPlacer(se::host::kHostPlatformId,
-                                                    &CreateComputationPlacer);
-  xla::ComputationPlacer::RegisterComputationPlacer(se::cuda::kCudaPlatformId,
-                                                    &CreateComputationPlacer);
+  xla::ComputationPlacer::RegisterComputationPlacer(
+      stream_executor::host::kHostPlatformId, &CreateComputationPlacer);
+  xla::ComputationPlacer::RegisterComputationPlacer(
+      stream_executor::cuda::kCudaPlatformId, &CreateComputationPlacer);
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h
index 737ccabaa7..737d00e93e 100644
--- a/tensorflow/compiler/xla/service/computation_placer.h
+++ b/tensorflow/compiler/xla/service/computation_placer.h
@@ -80,13 +80,13 @@ class ComputationPlacer {
 
   // Registers a computation placer creation function for a particular platform.
   static void RegisterComputationPlacer(
-      perftools::gputools::Platform::Id platform_id,
+      se::Platform::Id platform_id,
       ComputationPlacerCreationFunction creation_function);
 
   // Returns the computation placer singleton pointer if it is available for the
   // given platform, or an error status if it is not.
   static StatusOr<ComputationPlacer*> GetForPlatform(
-      const perftools::gputools::Platform* platform);
+      const se::Platform* platform);
 
  private:
   // The mutex that guards the platform-to-computation placer map.
@@ -101,10 +101,9 @@ class ComputationPlacer {
   };
 
   // Map from platform kind to computation placer singleton.
-  static std::map<perftools::gputools::Platform::Id, State>*
-  GetPlatformComputationPlacers();
+  static std::map<se::Platform::Id, State>* GetPlatformComputationPlacers();
 
-  perftools::gputools::Platform::Id platform_id_;
+  se::Platform::Id platform_id_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer);
 };
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index e43777c5e5..e8472fd36b 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -100,8 +100,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace cpu {
 
@@ -440,8 +438,7 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) {
 }  // namespace
 
 StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
-    std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* /*stream_exec*/,
+    std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
     DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(2) << "Before optimization:";
   XLA_VLOG_LINES(2, module->ToString());
@@ -454,8 +451,7 @@ StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
 }
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
-    std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* stream_exec,
+    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* /*device_allocator*/) {
   const string timer_message =
       "Compiling [" + module->name() + "] for CPU using JIT";
@@ -938,9 +934,9 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const {
 }  // namespace xla
 
 static bool InitModule() {
-  xla::Compiler::RegisterCompilerFactory(se::host::kHostPlatformId, []() {
-    return xla::MakeUnique<xla::cpu::CpuCompiler>();
-  });
+  xla::Compiler::RegisterCompilerFactory(
+      stream_executor::host::kHostPlatformId,
+      []() { return xla::MakeUnique<xla::cpu::CpuCompiler>(); });
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index 3498139ab9..151af38438 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -53,7 +53,7 @@ class CpuAotCompilationOptions : public AotCompilationOptions {
                            RelocationModel relocation_model);
   ~CpuAotCompilationOptions() override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   // The triple used for compilation, similar to clang's -target flag.
   const string& triple() const { return triple_; }
@@ -112,25 +112,23 @@ class CpuCompiler : public LLVMCompiler {
   // Bring in
   // StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
   //     std::vector<std::unique_ptr<HloModule>> modules,
-  //     std::vector<std::vector<perftools::gputools::StreamExecutor*>>
+  //     std::vector<std::vector<se::StreamExecutor*>>
   //        stream_execs)
   using LLVMCompiler::Compile;
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
                      const AotCompilationOptions& options) override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index c053703c35..aee62a4935 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -45,8 +45,6 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/stream_executor/host/host_stream.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace cpu {
 
@@ -75,7 +73,7 @@ CpuExecutable::CpuExecutable(
 
 Status CpuExecutable::AllocateBuffers(
     DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-    std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
+    std::vector<se::DeviceMemoryBase>* buffers) {
   CHECK_EQ(buffers->size(), assignment_->Allocations().size());
   VLOG(3) << "Allocating " << assignment_->Allocations().size()
           << " allocations for module " << module().name();
@@ -247,8 +245,7 @@ static Status DeallocateTempBuffers(
 
 StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-        allocated_buffers,
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
     std::vector<bool>* buffers_in_result) {
   se::Stream* stream = run_options->stream();
   auto result_buffer = MakeUnique<ShapedBuffer>(
@@ -322,7 +319,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
         "supported on CPU.");
   }
 
-  auto* host_stream = dynamic_cast<perftools::gputools::host::HostStream*>(
+  auto* host_stream = dynamic_cast<se::host::HostStream*>(
       run_options->stream()->implementation());
   se::Stream* stream = run_options->stream();
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index d3502b3a03..c3c2820c26 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -90,17 +90,16 @@ class CpuExecutable : public Executable {
   // assignment. Each vector element corresponds to a particular Index. If
   // a vector element already contains a non-null DeviceMemoryBase, then no
   // buffer is assigned for this element.
-  Status AllocateBuffers(
-      DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-      std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
+  Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
+                         int device_ordinal,
+                         std::vector<se::DeviceMemoryBase>* buffers);
 
   // Calls the generated function performing the computation with the given
   // arguments using the supplied buffers.
   Status ExecuteComputeFunction(
       const ExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          buffers,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
       HloExecutionProfile* hlo_execution_profile);
 
   // Create a ShapedBuffer for holding the result of the computation. The
@@ -111,8 +110,7 @@ class CpuExecutable : public Executable {
   // the returned ShapedBuffer).
   StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          allocated_buffers,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
       std::vector<bool>* buffers_in_result);
 
   // Returns the points-to set of the root instruction of the entry
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index f5e61aef53..9b39e7f576 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -34,8 +34,6 @@ limitations under the License.
 #include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 namespace {
@@ -241,21 +239,20 @@ Status CpuTransferManager::TransferLiteralFromOutfeed(
 }
 
 StatusOr<Shape> CpuTransferManager::TransferTupleBuffersFromOutfeed(
-    perftools::gputools::StreamExecutor* executor,
+    se::StreamExecutor* executor,
     tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data) {
   return TransferBuffersFromOutfeedInternal(executor, buffer_data,
                                             /*is_tuple=*/true);
 }
 
 StatusOr<Shape> CpuTransferManager::TransferArrayBufferFromOutfeed(
-    perftools::gputools::StreamExecutor* executor, void* destination,
-    int64 size_bytes) {
+    se::StreamExecutor* executor, void* destination, int64 size_bytes) {
   return TransferBuffersFromOutfeedInternal(
       executor, {{destination, size_bytes}}, /*is_tuple=*/false);
 }
 
 StatusOr<Shape> CpuTransferManager::TransferBuffersFromOutfeedInternal(
-    perftools::gputools::StreamExecutor* executor,
+    se::StreamExecutor* executor,
     tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
     bool is_tuple) {
   std::vector<std::unique_ptr<CpuOutfeedBuffer>> buffers;
@@ -306,8 +303,8 @@ static std::unique_ptr<xla::TransferManager> CreateCpuTransferManager() {
 }
 
 static bool InitModule() {
-  xla::TransferManager::RegisterTransferManager(se::host::kHostPlatformId,
-                                                &CreateCpuTransferManager);
+  xla::TransferManager::RegisterTransferManager(
+      stream_executor::host::kHostPlatformId, &CreateCpuTransferManager);
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
index 6c7524d947..3ecb0d2364 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
@@ -37,36 +37,35 @@ class CpuTransferManager : public GenericTransferManager {
   CpuTransferManager();
   ~CpuTransferManager() override {}
 
-  Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const Literal& literal) override;
-  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
-                                int64 size, const void* source) override;
-  Status TransferLiteralFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
-      Literal* literal) override;
+  Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
+                                const void* source) override;
+  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
+                                    const Shape& literal_shape,
+                                    Literal* literal) override;
 
  private:
   // Transfers infeed data to device. InfeedBuffer->Done() must be
   // called to clean up the memory allocated for InfeedBuffer.
   StatusOr<cpu::runtime::XfeedBuffer*> TransferBufferToInfeedInternal(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source);
+      se::StreamExecutor* executor, int64 size, const void* source);
 
   // Helper that transfers a tuple of element buffers from the device's outfeed.
   StatusOr<Shape> TransferTupleBuffersFromOutfeed(
-      perftools::gputools::StreamExecutor* executor,
+      se::StreamExecutor* executor,
       tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data);
 
   // Helper that transfers an array buffer from the device's outfeed.
-  StatusOr<Shape> TransferArrayBufferFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, void* destination,
-      int64 size_bytes);
+  StatusOr<Shape> TransferArrayBufferFromOutfeed(se::StreamExecutor* executor,
+                                                 void* destination,
+                                                 int64 size_bytes);
 
   // On success, returns the shape that was transferred from the outfeed -- if
   // is_tuple is true, the returned shape will be a tuple of the returned shapes
   // for the given buffers.
   StatusOr<Shape> TransferBuffersFromOutfeedInternal(
-      perftools::gputools::StreamExecutor* executor,
+      se::StreamExecutor* executor,
       tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
       bool is_tuple);
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index 07a9f0efcb..2d0f1d0be5 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -49,8 +49,6 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace cpu {
 
@@ -325,7 +323,7 @@ const void** Executor::GetOperandBuffers(HloInstruction* instruction) {
 
 Status ParallelCpuExecutable::AllocateBuffers(
     DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-    std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
+    std::vector<se::DeviceMemoryBase>* buffers) {
   CHECK_EQ(buffers->size(), assignment_->Allocations().size());
   VLOG(3) << "Allocating " << assignment_->Allocations().size()
           << " allocations for module " << module().name();
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index 87c0a3df45..d87ba57a1e 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -89,17 +89,16 @@ class ParallelCpuExecutable : public Executable {
   // assignment. Each vector element corresponds to a particular Index. If
   // a vector element already contains a non-null DeviceMemoryBase, then no
   // buffer is assigned for this element.
-  Status AllocateBuffers(
-      DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-      std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
+  Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
+                         int device_ordinal,
+                         std::vector<se::DeviceMemoryBase>* buffers);
 
   // Calls the generated functions in 'function_names_', performing the
   // computation with the given arguments using the supplied buffers.
   Status ExecuteComputeFunctions(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          buffers,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
       HloExecutionProfile* hlo_execution_profile);
 
   // Returns the points-to set of the root instruction of the entry
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc
index 78e7aa48ac..35db4fd2a2 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.cc
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc
@@ -24,19 +24,16 @@ limitations under the License.
 namespace xla {
 
 StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
-    const perftools::gputools::Platform* platform,
-    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-        stream_executors)
+    const se::Platform* platform,
+    tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors)
     : DeviceMemoryAllocator(platform),
       stream_executors_(stream_executors.begin(), stream_executors.end()) {}
 
-StatusOr<perftools::gputools::DeviceMemoryBase>
-StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
-                                        bool retry_on_failure) {
-  TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
+StatusOr<se::DeviceMemoryBase> StreamExecutorMemoryAllocator::Allocate(
+    int device_ordinal, uint64 size, bool retry_on_failure) {
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
                       GetStreamExecutor(device_ordinal));
-  perftools::gputools::DeviceMemoryBase result =
-      stream_executor->AllocateArray<uint8>(size);
+  se::DeviceMemoryBase result = stream_executor->AllocateArray<uint8>(size);
   if (size > 0 && result == nullptr) {
     return ResourceExhausted(
         "Failed to allocate request for %s (%lluB) on device ordinal %d",
@@ -47,22 +44,22 @@ StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
 }
 
 tensorflow::Status StreamExecutorMemoryAllocator::Deallocate(
-    int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) {
+    int device_ordinal, se::DeviceMemoryBase* mem) {
   if (!mem->is_null()) {
-    TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
+    TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
                         GetStreamExecutor(device_ordinal));
     // We make a local copy of 'mem' so the original is not zeroed out by the
     // Deallocate() call below. This gives us a better chance of
     // catching double-free bugs, since Deallocate silently succeeds for null
     // values.
-    perftools::gputools::DeviceMemoryBase mem_copy(*mem);
+    se::DeviceMemoryBase mem_copy(*mem);
     stream_executor->Deallocate(&mem_copy);
   }
   return tensorflow::Status::OK();
 }
 
-StatusOr<perftools::gputools::StreamExecutor*>
-StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) {
+StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor(
+    int device_ordinal) {
   if (device_ordinal < 0) {
     return InvalidArgument("device ordinal value (%d) must be non-negative",
                            device_ordinal);
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h
index 39dfad84c1..240acf8973 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.h
@@ -33,7 +33,7 @@ class DeviceMemoryAllocator {
  public:
   // Parameter platform indicates which platform the allocator allocates memory
   // on. Must be non-null.
-  explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform)
+  explicit DeviceMemoryAllocator(const se::Platform* platform)
       : platform_(platform) {}
   virtual ~DeviceMemoryAllocator() {}
 
@@ -43,20 +43,20 @@ class DeviceMemoryAllocator {
   // has only performance impact.
   // Allocate() should return a null pointer for a size-0 allocation.
   // Deallocate() must be a no-op for null pointers.
-  virtual StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
+  virtual StatusOr<se::DeviceMemoryBase> Allocate(
       int device_ordinal, uint64 size, bool retry_on_failure = true) = 0;
-  virtual tensorflow::Status Deallocate(
-      int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0;
+  virtual tensorflow::Status Deallocate(int device_ordinal,
+                                        se::DeviceMemoryBase* mem) = 0;
 
   // Return the platform that the allocator allocates memory on.
-  const perftools::gputools::Platform* platform() const { return platform_; }
+  const se::Platform* platform() const { return platform_; }
 
   // Can we call Deallocate() as soon as a computation has been scheduled on
   // a stream, or do we have to wait for the computation to complete first?
   virtual bool AllowsAsynchronousDeallocation() const = 0;
 
  protected:
-  const perftools::gputools::Platform* platform_;
+  const se::Platform* platform_;
 };
 
 // Default memory allocator for a platform which uses
@@ -64,25 +64,23 @@ class DeviceMemoryAllocator {
 class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
  public:
   StreamExecutorMemoryAllocator(
-      const perftools::gputools::Platform* platform,
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          stream_executors);
+      const se::Platform* platform,
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
+  StatusOr<se::DeviceMemoryBase> Allocate(
       int device_ordinal, uint64 size, bool retry_on_failure = true) override;
-  tensorflow::Status Deallocate(
-      int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override;
+  tensorflow::Status Deallocate(int device_ordinal,
+                                se::DeviceMemoryBase* mem) override;
 
   bool AllowsAsynchronousDeallocation() const override;
 
  private:
-  StatusOr<perftools::gputools::StreamExecutor*> GetStreamExecutor(
-      int device_ordinal);
+  StatusOr<se::StreamExecutor*> GetStreamExecutor(int device_ordinal);
 
   // A vector indexed by device ordinal of StreamExecutors for each device of
   // the allocator's platform type. If an element is nullptr, then the device
   // with the respective device ordinal is not supported by XLA.
-  std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
+  std::vector<se::StreamExecutor*> stream_executors_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 471d2fd6ce..caa46686be 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -61,10 +61,10 @@ Executable::ExecuteOnStreams(
 StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
     const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
     ArraySlice<const ShapedBuffer*> arguments) {
-  perftools::gputools::Stream* stream = run_options->stream();
-  std::unique_ptr<perftools::gputools::Timer> timer;
+  se::Stream* stream = run_options->stream();
+  std::unique_ptr<se::Timer> timer;
   if (profile != nullptr) {
-    timer.reset(new perftools::gputools::Timer(stream->parent()));
+    timer.reset(new se::Timer(stream->parent()));
     stream->InitTimer(timer.get()).ThenStartTimer(timer.get());
   }
 
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index a157235f8a..6f4cd99767 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -90,7 +90,7 @@ class Executable {
   // has completed.
   virtual Status PopulateExecutionProfile(
       HloExecutionProfile* hlo_execution_profile,
-      perftools::gputools::StreamExecutor* executor) {
+      se::StreamExecutor* executor) {
     return Status::OK();
   }
 
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index a99e2b7794..ddb687314e 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -32,8 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 GenericTransferManager::GenericTransferManager(se::Platform::Id platform_id,
@@ -45,9 +43,9 @@ se::Platform::Id GenericTransferManager::PlatformId() const {
 }
 
 Status GenericTransferManager::WriteSingleTupleIndexTable(
-    perftools::gputools::StreamExecutor* executor,
+    se::StreamExecutor* executor,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
-    const Shape& shape, perftools::gputools::DeviceMemoryBase* region) {
+    const Shape& shape, se::DeviceMemoryBase* region) {
   TF_RET_CHECK(elements.size() == ShapeUtil::TupleElementCount(shape));
 
   std::vector<const void*> element_pointers;
@@ -144,20 +142,19 @@ Status GenericTransferManager::TransferLiteralToInfeed(
 }
 
 Status GenericTransferManager::TransferBufferToInfeed(
-    perftools::gputools::StreamExecutor* executor, int64 size,
-    const void* source) {
+    se::StreamExecutor* executor, int64 size, const void* source) {
   return Unimplemented("Generic transfer to Infeed");
 }
 
 Status GenericTransferManager::TransferLiteralFromOutfeed(
-    perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
+    se::StreamExecutor* executor, const Shape& literal_shape,
     Literal* literal) {
   return Unimplemented(
       "Outfeed is not supported on this platform (b/30467474)");
 }
 
 Status GenericTransferManager::ResetDevices(
-    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
+    tensorflow::gtl::ArraySlice<se::StreamExecutor*>
     /*executors*/) {
   return Unimplemented(
       "Device reset is not yet supported on this platform (b/30481585)");
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 63a7c820cf..0579099de4 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -36,46 +36,41 @@ namespace xla {
 // infeed.
 class GenericTransferManager : public TransferManager {
  public:
-  GenericTransferManager(perftools::gputools::Platform::Id platform_id,
-                         size_t pointer_size);
+  GenericTransferManager(se::Platform::Id platform_id, size_t pointer_size);
   ~GenericTransferManager() override {}
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const ShapedBuffer& device_buffer) override;
+      se::StreamExecutor* executor, const ShapedBuffer& device_buffer) override;
 
-  Status TransferLiteralToDevice(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToDevice(se::StreamExecutor* executor,
                                  const Literal& literal,
                                  const ShapedBuffer& device_buffer) override;
 
-  Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const Literal& literal) override;
-  Status TransferLiteralFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
-      Literal* literal) override;
+  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
+                                    const Shape& literal_shape,
+                                    Literal* literal) override;
 
   Status ResetDevices(
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          executors) override;
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;
 
   int64 GetByteSizeRequirement(const Shape& shape) const override;
 
  protected:
-  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
-                                int64 size, const void* source) override;
+  Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
+                                const void* source) override;
 
   Status WriteSingleTupleIndexTable(
-      perftools::gputools::StreamExecutor* executor,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          elements,
-      const Shape& shape,
-      perftools::gputools::DeviceMemoryBase* region) override;
+      se::StreamExecutor* executor,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
+      const Shape& shape, se::DeviceMemoryBase* region) override;
 
  private:
   // The platform this transfer manager targets.
-  const perftools::gputools::Platform::Id platform_id_;
+  const se::Platform::Id platform_id_;
 
   // The size in bytes of pointers on this platform.
   const size_t pointer_size_;
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index 2029c303d4..837f05244f 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -28,8 +28,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index ea7f0eb374..c2fc35be4c 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -41,7 +41,7 @@ class BufferAllocations {
     // user-specified result buffers) to the given buffer index. The builder
     // will skip allocating buffers for registered buffer indices.
     void RegisterBuffer(BufferAllocation::Index index,
-                        perftools::gputools::DeviceMemoryBase address);
+                        se::DeviceMemoryBase address);
 
     // Builds a BufferAllocations object from the given buffer assignment.
     // `memory_allocator` is what this function uses to allocate device memory.
@@ -52,8 +52,7 @@ class BufferAllocations {
         DeviceMemoryAllocator* memory_allocator);
 
    private:
-    std::map<BufferAllocation::Index, perftools::gputools::DeviceMemoryBase>
-        registered_buffers_;
+    std::map<BufferAllocation::Index, se::DeviceMemoryBase> registered_buffers_;
   };
 
   BufferAllocations(const BufferAllocations&) = delete;
@@ -65,22 +64,20 @@ class BufferAllocations {
   // Returns the device address of buffer `buffer_index`. `buffer_index` must be
   // a valid index, i.e., in [0, buffer_count). This function returns null if
   // `buffer_index` is not assigned to a buffer address.
-  perftools::gputools::DeviceMemoryBase GetDeviceAddress(
+  se::DeviceMemoryBase GetDeviceAddress(
       BufferAllocation::Index buffer_index) const;
 
   // Same as above, but also adjusts the returned address for the offset and
   // size contained in the given slice.
-  perftools::gputools::DeviceMemoryBase GetDeviceAddress(
+  se::DeviceMemoryBase GetDeviceAddress(
       const BufferAllocation::Slice& buffer_slice) const;
 
-  perftools::gputools::DeviceMemoryBase GetTempBufferBase() const {
-    return temp_buffer_base_;
-  }
+  se::DeviceMemoryBase GetTempBufferBase() const { return temp_buffer_base_; }
 
   // Tears down all buffers allocated by this object that are not in
   // `live_addresses`.
   tensorflow::Status TearDown(
-      const std::set<perftools::gputools::DeviceMemoryBase>& live_addresses,
+      const std::set<se::DeviceMemoryBase>& live_addresses,
       const BufferAssignment& buffer_assignment);
 
  private:
@@ -92,15 +89,15 @@ class BufferAllocations {
 
   // Sets the device address of buffer `buffer_index`.
   void SetBuffer(BufferAllocation::Index buffer_index,
-                 perftools::gputools::DeviceMemoryBase buffer);
+                 se::DeviceMemoryBase buffer);
 
   // An array of device pointers that stores the address of each buffer
   // indexed by Index. Each element can point to a temporary buffer, an
   // input buffer, or nullptr if no buffer is needed for that Index.
-  std::vector<perftools::gputools::DeviceMemoryBase> buffers_;
+  std::vector<se::DeviceMemoryBase> buffers_;
 
   // The base address of the memory block that contains all temporary buffers.
-  perftools::gputools::DeviceMemoryBase temp_buffer_base_;
+  se::DeviceMemoryBase temp_buffer_base_;
 
   int device_ordinal_;
 
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
index 790ca535b1..dce8de2e30 100644
--- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
@@ -42,11 +42,10 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable) {
 }
 
 Status ConditionalThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
   // Copy the predicate value from device.
   bool predicate;
-  perftools::gputools::DeviceMemoryBase predicate_address =
+  se::DeviceMemoryBase predicate_address =
       buffer_allocations.GetDeviceAddress(predicate_buffer_index_);
   stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool));
 
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
index 7725c46a3b..e40872688f 100644
--- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
@@ -49,7 +49,7 @@ class ConditionalThunk : public Thunk {
 
   Status Initialize(const GpuExecutable& executable) override;
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice predicate_buffer_index_;
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 461747b699..64d3b84b8c 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -25,8 +25,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index 900d9cb624..6d845025b1 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -66,23 +66,21 @@ class ConvolutionThunk : public Thunk {
 
   // Does the convolution for the thunk on "stream".
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   class ScratchAllocator;
 
-  Status Convolve(
-      const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
-      perftools::gputools::DeviceMemory<float> input_data,
-      const perftools::gputools::dnn::FilterDescriptor& filter_descriptor,
-      perftools::gputools::DeviceMemory<float> filter_data,
-      const perftools::gputools::dnn::BatchDescriptor& output_descriptor,
-      perftools::gputools::DeviceMemory<float> output_data,
-      const perftools::gputools::dnn::ConvolutionDescriptor&
-          convolution_descriptor,
-      const perftools::gputools::dnn::AlgorithmConfig& algorithm_config,
-      perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator,
-      perftools::gputools::dnn::ProfileResult* profile_result);
+  Status Convolve(const se::dnn::BatchDescriptor& input_descriptor,
+                  se::DeviceMemory<float> input_data,
+                  const se::dnn::FilterDescriptor& filter_descriptor,
+                  se::DeviceMemory<float> filter_data,
+                  const se::dnn::BatchDescriptor& output_descriptor,
+                  se::DeviceMemory<float> output_data,
+                  const se::dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const se::dnn::AlgorithmConfig& algorithm_config,
+                  se::Stream* stream, ScratchAllocator* scratch_allocator,
+                  se::dnn::ProfileResult* profile_result);
 
   const CudnnConvKind convolution_kind_;
 
diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
index f4498663b1..bf912fbd14 100644
--- a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
@@ -30,9 +30,8 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk(
       mem_size_(mem_size) {}
 
 tensorflow::Status HostToDeviceCopyThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
-  perftools::gputools::DeviceMemoryBase destination_data =
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  se::DeviceMemoryBase destination_data =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
   stream->ThenMemcpy(&destination_data, source_address_, mem_size_);
   return tensorflow::Status::OK();
@@ -48,11 +47,10 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk(
       mem_size_(mem_size) {}
 
 tensorflow::Status DeviceToDeviceCopyThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
-  perftools::gputools::DeviceMemoryBase destination_data =
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  se::DeviceMemoryBase destination_data =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
-  perftools::gputools::DeviceMemoryBase source_data =
+  se::DeviceMemoryBase source_data =
       buffer_allocations.GetDeviceAddress(source_buffer_);
   stream->ThenMemcpy(&destination_data, source_data, mem_size_);
   return tensorflow::Status::OK();
diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.h b/tensorflow/compiler/xla/service/gpu/copy_thunk.h
index e2783fd255..2e7eb5f344 100644
--- a/tensorflow/compiler/xla/service/gpu/copy_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.h
@@ -40,8 +40,7 @@ class HostToDeviceCopyThunk : public Thunk {
   HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete;
 
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const void* source_address_;
@@ -64,8 +63,7 @@ class DeviceToDeviceCopyThunk : public Thunk {
   DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete;
 
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const BufferAllocation::Slice source_buffer_;
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
index 58d9c8caff..68099fd638 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
@@ -28,7 +28,6 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-namespace se = ::perftools::gputools;
 namespace dnn = se::dnn;
 
 static std::pair<dnn::BatchDescriptor /*input_desc*/,
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h
index c5fbb6d8a3..874f85a863 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h
@@ -60,7 +60,7 @@ class CudnnBatchNormForwardInferenceThunk : public Thunk {
       const CudnnBatchNormForwardInferenceThunk&) = delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice operand_;
@@ -90,7 +90,7 @@ class CudnnBatchNormForwardTrainingThunk : public Thunk {
       const CudnnBatchNormForwardTrainingThunk&) = delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice operand_;
@@ -123,7 +123,7 @@ class CudnnBatchNormBackwardThunk : public Thunk {
       delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice operand_;
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index d6b457a91b..1790c50d4d 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -24,8 +24,6 @@ namespace xla {
 namespace gpu {
 namespace {
 
-namespace se = perftools::gputools;
-
 using se::DeviceMemoryBase;
 using se::dnn::AlgorithmConfig;
 using se::dnn::AlgorithmDesc;
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
index 516210ec2e..bc5d1ce94a 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
@@ -33,9 +33,8 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
   // If the `allocator` parameter is not null, we will use it to allocate temp
   // memory while timing the various convolution algorithms.  If it's null,
   // we'll use the default allocator on the StreamExecutor.
-  CudnnConvolutionAlgorithmPicker(
-      perftools::gputools::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* allocator)
+  CudnnConvolutionAlgorithmPicker(se::StreamExecutor* stream_exec,
+                                  DeviceMemoryAllocator* allocator)
       : stream_exec_(stream_exec), allocator_(allocator) {}
 
   tensorflow::StringPiece name() const override {
@@ -52,7 +51,7 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
       const Shape& output_shape, const Window& window,
       const ConvolutionDimensionNumbers& dnums, HloInstruction* instr);
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // never null
+  se::StreamExecutor* stream_exec_;                   // never null
   DeviceMemoryAllocator* allocator_;                  // may be null
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
index e4ae839e1d..10b4c3de89 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
@@ -22,8 +22,6 @@ namespace xla {
 namespace gpu {
 namespace {
 
-namespace se = ::perftools::gputools;
-
 using se::DeviceMemory;
 using se::DeviceMemoryBase;
 using se::Stream;
@@ -215,14 +213,12 @@ string CudnnConvKindToString(CudnnConvKind kind) {
 
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::DeviceMemoryBase scratch_buf, const Window& window,
     const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result) {
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result) {
   ScratchBufAllocator scratch_allocator(scratch_buf);
   return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
                              input_buf, filter_buf, output_buf,
@@ -232,14 +228,12 @@ Status RunCudnnConvolution(
 
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::ScratchAllocator* scratch_allocator,
-    const Window& window, const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result) {
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::ScratchAllocator* scratch_allocator, const Window& window,
+    const ConvolutionDimensionNumbers& dnums,
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result) {
   PrimitiveType output_primitive_type = output_shape.element_type();
   CHECK(output_primitive_type == F32 || output_primitive_type == F16)
       << ShapeUtil::HumanString(output_shape);
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
index 3dbfa2730d..944e4ac686 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
@@ -72,25 +72,21 @@ string CudnnConvKindToString(CudnnConvKind kind);
 // that size, if you like.
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::DeviceMemoryBase scratch_buf, const Window& window,
     const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result = nullptr);
 
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::ScratchAllocator* scratch_allocator,
-    const Window& window, const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::ScratchAllocator* scratch_allocator, const Window& window,
+    const ConvolutionDimensionNumbers& dnums,
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result = nullptr);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
index 66931bdc8b..cc747addbd 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
@@ -24,8 +24,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
index 52fb8c376d..24b1dca998 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
@@ -34,24 +34,24 @@ namespace gpu {
 // released on destruction.
 //
 // Not thread-safe in that AllocateBytes, destructor are not locked.
-class FftScratchAllocator : public perftools::gputools::ScratchAllocator {
+class FftScratchAllocator : public se::ScratchAllocator {
  public:
   FftScratchAllocator(int device_ordinal,
                       DeviceMemoryAllocator* memory_allocator);
 
   ~FftScratchAllocator() override;
 
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override;
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override;
 
   int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
 
-  perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
-  AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override;
+  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override;
 
  private:
   const int device_ordinal_;
   DeviceMemoryAllocator* memory_allocator_;
-  std::vector<perftools::gputools::DeviceMemoryBase> allocated_buffers_;
+  std::vector<se::DeviceMemoryBase> allocated_buffers_;
   int64 total_allocated_bytes_ = 0;
 };
 
@@ -74,16 +74,15 @@ class FftThunk : public Thunk {
 
   // Does the FFT for the thunk on "stream".
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
-  const perftools::gputools::fft::Type fft_type_;
+  const se::fft::Type fft_type_;
   const std::vector<int64> fft_length_;
 
   float scale_factor_;
 
-  std::unique_ptr<perftools::gputools::fft::Plan> fft_plan_;
+  std::unique_ptr<se::fft::Plan> fft_plan_;
 
   const BufferAllocation::Slice input_buffer_;
   const BufferAllocation::Slice output_buffer_;
diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc
index 283d21ca22..6e6966df39 100644
--- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc
@@ -36,8 +36,7 @@ tensorflow::Status ForThunk::Initialize(const GpuExecutable& executable) {
 }
 
 tensorflow::Status ForThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
   for (int64 i = 0; i < loop_limit_; ++i) {
     // Invoke loop body thunk sequence.
     TF_RETURN_IF_ERROR(
diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h
index 832494d17e..c78d1c5068 100644
--- a/tensorflow/compiler/xla/service/gpu/for_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h
@@ -38,8 +38,7 @@ class ForThunk : public Thunk {
 
   tensorflow::Status Initialize(const GpuExecutable& executable) override;
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const int64 loop_limit_;
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index 38668ff455..0ec12f52d8 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -22,8 +22,6 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
index df3edcefef..a18f425bc3 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
@@ -50,14 +50,12 @@ class GemmThunk : public Thunk {
 
   // Does the gemm operation for the thunk on "stream", which must be non-null.
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
   // Returns true if we'll perform autotuning if run on the given stream.  If
   // so, we want the GPU to be quiescent during autotuning, so as not to
   // introduce noise in our results.
-  bool ShouldHaltAllActivityBeforeRunning(
-      perftools::gputools::Stream* stream) override {
+  bool ShouldHaltAllActivityBeforeRunning(se::Stream* stream) override {
     return autotune_results_.count(
                stream->parent()->GetDeviceDescription().name()) != 0;
   }
@@ -79,8 +77,7 @@ class GemmThunk : public Thunk {
   // results.  The map's value is the best algorithm we've found for this thunk
   // on this device, or an error if none of the algorithms worked and we should
   // use the regular gemm without an algorithm.
-  std::unordered_map<string,
-                     StatusOr<::perftools::gputools::blas::AlgorithmType>>
+  std::unordered_map<string, StatusOr<se::blas::AlgorithmType>>
       autotune_results_;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 07be2a0cf9..30bfc9351a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -91,8 +91,6 @@ limitations under the License.
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
@@ -779,9 +777,9 @@ se::Platform::Id GpuCompiler::PlatformId() const {
 }  // namespace xla
 
 static bool InitModule() {
-  xla::Compiler::RegisterCompilerFactory(se::cuda::kCudaPlatformId, []() {
-    return xla::MakeUnique<xla::gpu::GpuCompiler>();
-  });
+  xla::Compiler::RegisterCompilerFactory(
+      stream_executor::cuda::kCudaPlatformId,
+      []() { return xla::MakeUnique<xla::gpu::GpuCompiler>(); });
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index c352d4d846..f3b02ae5d8 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -45,25 +45,23 @@ class GpuCompiler : public LLVMCompiler {
   // Bring in
   // StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
   //     std::vector<std::unique_ptr<HloModule>> modules,
-  //     std::vector<std::vector<perftools::gputools::StreamExecutor*>>
+  //     std::vector<std::vector<se::StreamExecutor*>>
   //        stream_execs)
   using LLVMCompiler::Compile;
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
                      AotCompilationOptions const& options) override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
     // Capture just the pointer size, not the entire GpuCompiler object.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 28f9344795..5676d4de8e 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -34,8 +34,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 namespace {
@@ -324,7 +322,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
             this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
         CHECK(!slice.allocation()->is_entry_computation_parameter());
 
-        perftools::gputools::DeviceMemoryBase src_base =
+        se::DeviceMemoryBase src_base =
             buffer_allocations->GetDeviceAddress(slice.index());
         CHECK(!src_base.is_null() || src_base.size() == 0);
         *device_memory = src_base;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index af9897769f..f13727ca9b 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -33,8 +33,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 // TODO(b/30467474) Once GPU infeed implementation settles, consider
@@ -153,8 +151,8 @@ static std::unique_ptr<xla::TransferManager> CreateGpuTransferManager() {
 }
 
 static bool InitModule() {
-  xla::TransferManager::RegisterTransferManager(se::cuda::kCudaPlatformId,
-                                                &CreateGpuTransferManager);
+  xla::TransferManager::RegisterTransferManager(
+      stream_executor::cuda::kCudaPlatformId, &CreateGpuTransferManager);
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
index 9aa369c668..d040a99975 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
@@ -36,21 +36,20 @@ class GpuTransferManager : public GenericTransferManager {
   GpuTransferManager();
   ~GpuTransferManager() override {}
 
-  Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const Literal& literal) override;
-  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
-                                int64 size, const void* source) override;
+  Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
+                                const void* source) override;
 
  private:
   // Initiates the infeed data transfers. InfeedBuffer->Done() must be
   // called to clean up the memory allocated for InfeedBuffer.
   StatusOr<gpu::InfeedBuffer*> TransferBufferToInfeedInternal(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source);
+      se::StreamExecutor* executor, int64 size, const void* source);
 
   // Enqueues infeed data buffers with the infeed manager after their
   // transfer completes.
-  Status EnqueueBuffersToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status EnqueueBuffersToInfeed(se::StreamExecutor* executor,
                                 std::vector<gpu::InfeedBuffer*> buffers);
 
   TF_DISALLOW_COPY_AND_ASSIGN(GpuTransferManager);
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
index ee5b447c9c..3ddc1c0789 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
@@ -19,8 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.h b/tensorflow/compiler/xla/service/gpu/infeed_manager.h
index 73d5a5ce35..d5f2216d46 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.h
@@ -46,7 +46,7 @@ namespace gpu {
 // the client. The client manages the memory of the buffer.
 class InfeedBuffer {
  public:
-  InfeedBuffer(perftools::gputools::StreamExecutor* executor, int64 length)
+  InfeedBuffer(se::StreamExecutor* executor, int64 length)
       : executor_(executor), length_(length) {
     device_memory_ = executor_->AllocateArray<uint8>(length);
     CHECK(!device_memory_.is_null());
@@ -60,14 +60,12 @@ class InfeedBuffer {
   // client to manage memory for the infeed buffers.
   void Done() { delete this; }
 
-  perftools::gputools::DeviceMemoryBase* device_memory() {
-    return &device_memory_;
-  }
+  se::DeviceMemoryBase* device_memory() { return &device_memory_; }
 
  private:
-  perftools::gputools::StreamExecutor* executor_;  // Not owned.
+  se::StreamExecutor* executor_;  // Not owned.
   const int64 length_;
-  perftools::gputools::DeviceMemoryBase device_memory_;
+  se::DeviceMemoryBase device_memory_;
 };
 
 // Client-side class used to enqueue infeed buffers.
@@ -100,8 +98,7 @@ class InfeedManager {
   // new stream on the first invocation. On subsequent invocations, if
   // the cached executor is not the same as the requested executor,
   // returns null.
-  perftools::gputools::Stream* GetStream(
-      perftools::gputools::StreamExecutor* executor);
+  se::Stream* GetStream(se::StreamExecutor* executor);
 
  private:
   // TODO(b/30467474): Revisit if this mutex becomes a point of
@@ -121,10 +118,10 @@ class InfeedManager {
   tensorflow::gtl::FlatSet<const InfeedBuffer*> dequeued_buffer_;
 
   // Cached host to device stream for queuing infeed data.
-  std::unique_ptr<perftools::gputools::Stream> host_to_device_stream_;
+  std::unique_ptr<se::Stream> host_to_device_stream_;
 
   // Executor that the host_to_device_stream belongs to. Not owned.
-  perftools::gputools::StreamExecutor* host_to_device_executor_;
+  se::StreamExecutor* host_to_device_executor_;
 };
 
 // Singleton creator-or-accessor: Returns the GPU infeed manager.
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
index 2ac95ceb69..ea34d5b30c 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
@@ -31,10 +31,10 @@ InfeedThunk::InfeedThunk(
       destination_buffer_(destination_buffer) {}
 
 Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                                    perftools::gputools::Stream* stream) {
+                                    se::Stream* stream) {
   VLOG(2) << "Infeeding to GPU ";
 
-  perftools::gputools::DeviceMemoryBase destination_address =
+  se::DeviceMemoryBase destination_address =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
 
   InfeedManager* infeed_manager = GetOrCreateInfeedManager();
@@ -45,7 +45,7 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
     std::vector<void*> tuple_element_addresses;
     for (BufferAllocation::Slice tuple_element_buffer :
          tuple_element_buffers_) {
-      perftools::gputools::DeviceMemoryBase tuple_element_address =
+      se::DeviceMemoryBase tuple_element_address =
           buffer_allocations.GetDeviceAddress(tuple_element_buffer);
 
       InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer();
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
index 86918705fa..93713cb12d 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
@@ -44,7 +44,7 @@ class InfeedThunk : public Thunk {
   InfeedThunk& operator=(const InfeedThunk&) = delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
index 3790ed313b..a78b4ff830 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
@@ -32,7 +32,7 @@ class IrEmitterContext {
  public:
   IrEmitterContext(const HloModule* hlo_module,
                    const BufferAssignment* buffer_assignment,
-                   const perftools::gputools::DeviceDescription* device_desc,
+                   const se::DeviceDescription* device_desc,
                    llvm::Module* llvm_module)
       : hlo_module_(hlo_module),
         buffer_assignment_(buffer_assignment),
@@ -47,7 +47,7 @@ class IrEmitterContext {
   const BufferAssignment& buffer_assignment() const {
     return *buffer_assignment_;
   }
-  const perftools::gputools::DeviceDescription& device_description() const {
+  const se::DeviceDescription& device_description() const {
     return *device_desc_;
   }
   llvm::Module* llvm_module() { return llvm_module_; }
@@ -56,7 +56,7 @@ class IrEmitterContext {
  private:
   const HloModule* hlo_module_;
   const BufferAssignment* buffer_assignment_;
-  const perftools::gputools::DeviceDescription* device_desc_;
+  const se::DeviceDescription* device_desc_;
   llvm::Module* llvm_module_;
   NameUniquer name_uniquer_;
 };
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index c24dc1457f..d376ef7a24 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -23,8 +23,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
index df8971b083..b556befe66 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
@@ -61,8 +61,7 @@ class KernelThunk : public Thunk {
 
   // Executes the kernel for the thunk on "stream", which must be non-null.
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   // Buffers passed to the kernel as arguments.
@@ -82,13 +81,11 @@ class KernelThunk : public Thunk {
   // Describes how to load this kernel. ExecuteOnStream reuses this loader
   // specification for all executions.
   mutable tensorflow::mutex mutex_;
-  std::unique_ptr<perftools::gputools::MultiKernelLoaderSpec> loader_spec_
-      GUARDED_BY(mutex_);
+  std::unique_ptr<se::MultiKernelLoaderSpec> loader_spec_ GUARDED_BY(mutex_);
 
   // Loaded kernels for each `StreamExecutor`
-  std::unordered_map<perftools::gputools::StreamExecutor*,
-                     perftools::gputools::KernelBase>
-      kernel_cache_ GUARDED_BY(mutex_);
+  std::unordered_map<se::StreamExecutor*, se::KernelBase> kernel_cache_
+      GUARDED_BY(mutex_);
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc
index 18e673542c..d4100a898b 100644
--- a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc
@@ -19,8 +19,6 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-namespace se = ::perftools::gputools;
-
 Status MemzeroThunk::ExecuteOnStream(
     const BufferAllocations& buffer_allocations, se::Stream* stream) {
   se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_);
diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.h b/tensorflow/compiler/xla/service/gpu/memset_thunk.h
index b4bb74d1dd..51c332d287 100644
--- a/tensorflow/compiler/xla/service/gpu/memset_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.h
@@ -36,7 +36,7 @@ class MemzeroThunk : public Thunk {
       : Thunk(Kind::kMemzero, hlo), dest_(dest) {}
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   const BufferAllocation::Slice dest_;
@@ -52,7 +52,7 @@ class Memset32BitValueThunk : public Thunk {
       : Thunk(Kind::kMemset32BitValue, hlo), value_(value), dest_(dest) {}
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   uint32 value_;
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index 5283d51cd1..d3fd0544fb 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 42d2d2af2e..c125474edb 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -57,8 +57,7 @@ std::ostream& operator<<(std::ostream& out,
                          const LaunchDimensions& launch_dims);
 
 LaunchDimensions CalculateLaunchDimensions(
-    const Shape& shape,
-    const perftools::gputools::DeviceDescription& device_desc,
+    const Shape& shape, const se::DeviceDescription& device_desc,
     int unroll_factor = 1);
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
index d8a43091d4..c8510808f1 100644
--- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
@@ -33,8 +33,7 @@ tensorflow::Status SequentialThunk::Initialize(
 }
 
 tensorflow::Status SequentialThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
   for (const auto& thunk : thunks_) {
     TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
   }
diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
index 32c5b748ab..df17b8d67b 100644
--- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
@@ -40,8 +40,7 @@ class SequentialThunk : public Thunk {
 
   tensorflow::Status Initialize(const GpuExecutable& executable) override;
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   // The list of sub-thunks.
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 9eea958d12..a0c785ed91 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -85,8 +85,7 @@ class Thunk {
   // This value is not required to be constant for a given Thunk.  For example,
   // a Thunk that performs autotuning may return true for its first run and
   // false thereafter.
-  virtual bool ShouldHaltAllActivityBeforeRunning(
-      perftools::gputools::Stream* /*stream*/) {
+  virtual bool ShouldHaltAllActivityBeforeRunning(se::Stream* /*stream*/) {
     return false;
   }
 
@@ -104,8 +103,7 @@ class Thunk {
   // called after Initialize and can be called multiple times over Thunk's
   // lifetime. Stream argument must be non-null.
   virtual tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) = 0;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) = 0;
 
  private:
   Kind kind_;
diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
index bd65e72393..ecb54857cc 100644
--- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
@@ -17,8 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/util.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
index 3b1a496328..8b459c29a1 100644
--- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
@@ -46,8 +46,7 @@ class TupleThunk : public Thunk {
   TupleThunk& operator=(const TupleThunk&) = delete;
 
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
index c21559af6d..a9f3d619a3 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
@@ -41,8 +41,8 @@ Status WhileThunk::Initialize(const GpuExecutable& executable) {
 }
 
 Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                                   perftools::gputools::Stream* stream) {
-  perftools::gputools::DeviceMemoryBase condition_result_data =
+                                   se::Stream* stream) {
+  se::DeviceMemoryBase condition_result_data =
       buffer_allocations.GetDeviceAddress(condition_result_buffer_index_);
 
   while (true) {
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h
index 4c9f45de9e..e589ca78a7 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h
@@ -47,7 +47,7 @@ class WhileThunk : public Thunk {
 
   Status Initialize(const GpuExecutable& executable) override;
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   const BufferAllocation::Slice condition_result_buffer_index_;
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 6fb91b9bef..be989846ef 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -88,7 +88,7 @@ std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
 // down how much time each HLO took.
 class HloExecutionProfile {
  public:
-  using DeviceDescription = perftools::gputools::DeviceDescription;
+  using DeviceDescription = se::DeviceDescription;
 
   HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data,
                       const HloProfileIndexMap* hlo_profile_index_map);
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 2e834a79d9..171477299e 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -30,8 +30,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 /*static*/ StatusOr<std::unique_ptr<HloModule>>
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index f54fb44766..53f7c6fe4a 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -80,7 +80,7 @@ class HloRunner {
     bool run_hlo_passes = false;
   };
 
-  explicit HloRunner(::perftools::gputools::Platform* platform);
+  explicit HloRunner(se::Platform* platform);
 
   ~HloRunner();
 
@@ -149,8 +149,7 @@ class HloRunner {
   // will be used to configure the replication parameters. Replicated executions
   // should pass the device_assignment parameter.
   ServiceExecutableRunOptions GetServiceRunOptionsForDevice(
-      int64 device, ::perftools::gputools::Stream* stream,
-      DeviceAssignment* device_assignment);
+      int64 device, se::Stream* stream, DeviceAssignment* device_assignment);
 
   std::unique_ptr<Backend> backend_;
 };
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 5b9bf5faf3..76b3ecad26 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -41,9 +41,6 @@ limitations under the License.
 namespace xla {
 namespace interpreter {
 
-namespace se = ::perftools::gputools;
-namespace sep = ::perftools::gputools::interpreter;
-
 Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
 
@@ -96,7 +93,7 @@ InterpreterCompiler::CompileAheadOfTime(
 }
 
 se::Platform::Id InterpreterCompiler::PlatformId() const {
-  return sep::kXlaInterpreterPlatformId;
+  return se::interpreter::kXlaInterpreterPlatformId;
 }
 
 HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction()
@@ -109,11 +106,12 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
 }
 
 static bool InitModule() {
-  xla::Compiler::RegisterCompilerFactory(sep::kXlaInterpreterPlatformId, []() {
-    return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
-  });
+  xla::Compiler::RegisterCompilerFactory(
+      se::interpreter::kXlaInterpreterPlatformId, []() {
+        return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
+      });
   xla::ComputationPlacer::RegisterComputationPlacer(
-      sep::kXlaInterpreterPlatformId, &CreateComputationPlacer);
+      se::interpreter::kXlaInterpreterPlatformId, &CreateComputationPlacer);
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h
index c8660c04d8..e90ae3e818 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.h
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.h
@@ -44,19 +44,16 @@ class InterpreterCompiler : public Compiler {
   ~InterpreterCompiler() override {}
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> hlo_modules,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec,
+      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
@@ -65,7 +62,7 @@ class InterpreterCompiler : public Compiler {
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
  private:
   Status RunHloOptimization(HloModule* hlo_module);
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 883063d0f0..acfa79ea75 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -38,8 +38,6 @@ limitations under the License.
 namespace xla {
 namespace interpreter {
 
-namespace se = ::perftools::gputools;
-
 InterpreterExecutable::InterpreterExecutable(
     std::unique_ptr<const HloModule> hlo_module)
     : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc
index 3caf9e7b82..97e9fa2c8e 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executor.cc
@@ -19,8 +19,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/status_macros.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 host::HostStream *AsExecutorStream(Stream *stream) {
@@ -119,5 +118,4 @@ DeviceDescription *XlaInterpreterExecutor::PopulateDeviceDescription() const {
 }
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index 77426b0820..9b109022fb 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -44,8 +44,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 #include "tensorflow/stream_executor/timer.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;
@@ -213,7 +212,6 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
 };
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_
diff --git a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
index 3cf8506d1c..d27cd7502f 100644
--- a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
@@ -21,12 +21,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 
-namespace sei = ::perftools::gputools::interpreter;
-
 namespace xla {
 
 InterpreterTransferManager::InterpreterTransferManager()
-    : GenericTransferManager(sei::kXlaInterpreterPlatformId,
+    : GenericTransferManager(se::interpreter::kXlaInterpreterPlatformId,
                              /*pointer_size=*/sizeof(void*)) {}
 
 }  // namespace xla
@@ -38,7 +36,8 @@ CreateInterpreterTransferManager() {
 
 static bool InitModule() {
   xla::TransferManager::RegisterTransferManager(
-      sei::kXlaInterpreterPlatformId, &CreateInterpreterTransferManager);
+      stream_executor::interpreter::kXlaInterpreterPlatformId,
+      &CreateInterpreterTransferManager);
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc
index 015e00e1e8..ce2f4d378c 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform.cc
@@ -28,11 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
 
-namespace se = ::perftools::gputools;
-namespace sep = ::perftools::gputools::interpreter;
-
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {}
@@ -99,16 +95,16 @@ void XlaInterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
 }
 
 static void InitializeXlaInterpreterPlatform() {
-  std::unique_ptr<se::Platform> platform(new sep::XlaInterpreterPlatform);
-  SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
+  std::unique_ptr<Platform> platform(new XlaInterpreterPlatform);
+  SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(interpreter_platform,
-                            sep::InitializeXlaInterpreterPlatform());
+REGISTER_MODULE_INITIALIZER(
+    interpreter_platform,
+    stream_executor::interpreter::InitializeXlaInterpreterPlatform());
 
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.h b/tensorflow/compiler/xla/service/interpreter/platform.h
index 2f71b29be4..d68c5aa20d 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 class XlaInterpreterPlatform : public Platform {
@@ -64,7 +63,6 @@ class XlaInterpreterPlatform : public Platform {
 };
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_H_
diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.cc b/tensorflow/compiler/xla/service/interpreter/platform_id.cc
index b7fb365b70..3272396ce5 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform_id.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform_id.cc
@@ -14,12 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 PLATFORM_DEFINE_ID(kXlaInterpreterPlatformId);
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.h b/tensorflow/compiler/xla/service/interpreter/platform_id.h
index 292f958449..a6cc10bcc1 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform_id.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform_id.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 extern const Platform::Id kXlaInterpreterPlatformId;
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_ID_H_
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc
index 911b243fe2..b17c9d5045 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.cc
+++ b/tensorflow/compiler/xla/service/llvm_compiler.cc
@@ -23,7 +23,7 @@ limitations under the License.
 namespace xla {
 StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
     std::vector<std::unique_ptr<HloModule>> modules,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>> stream_execs,
+    std::vector<std::vector<se::StreamExecutor*>> stream_execs,
     DeviceMemoryAllocator* device_allocator) {
   // Tensorflow tries to enable the following behaviors in all its threads:
   //
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h
index d74e81bb7f..f1c623508c 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.h
+++ b/tensorflow/compiler/xla/service/llvm_compiler.h
@@ -60,19 +60,18 @@ class LLVMCompiler : public Compiler {
   // Bring in
   //   StatusOr<std::unique_ptr<Executable>> RunBackend(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       se::StreamExecutor* stream_exec,
   //       DeviceMemoryAllocator* device_allocator)
   //   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       se::StreamExecutor* stream_exec,
   //       DeviceMemoryAllocator* device_allocator)
   using Compiler::RunBackend;
   using Compiler::RunHloPasses;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_execs,
+      std::vector<std::vector<se::StreamExecutor*>> stream_execs,
       DeviceMemoryAllocator* device_allocator) override;
 
  protected:
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 499f280211..0fa4061738 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -43,13 +43,11 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 /* static */ StatusOr<std::unique_ptr<LocalService>> LocalService::NewService(
     const ServiceOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   if (platform == nullptr) {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index aa974ee61a..7c63c0acc7 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 using tensorflow::str_util::Lowercase;
diff --git a/tensorflow/compiler/xla/service/platform_util.h b/tensorflow/compiler/xla/service/platform_util.h
index 69188820a7..571451ba43 100644
--- a/tensorflow/compiler/xla/service/platform_util.h
+++ b/tensorflow/compiler/xla/service/platform_util.h
@@ -34,29 +34,27 @@ class PlatformUtil {
   //
   // Note that, even if a platform is present with zero devices, if we *do* have
   // compilation support for it, it will be returned in this sequence.
-  static StatusOr<std::vector<perftools::gputools::Platform*>>
-  GetSupportedPlatforms();
+  static StatusOr<std::vector<se::Platform*>> GetSupportedPlatforms();
 
   // Convenience function which returns the default supported platform for
   // tests. If exactly one supported platform is present, then this platform is
   // the default platform. If exactly two platforms are present and one of them
   // is the interpreter platform, then the other platform is the default
   // platform. Otherwise returns an error.
-  static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
+  static StatusOr<se::Platform*> GetDefaultPlatform();
 
   // Convenience function which returns the sole supported platform. If
   // exactly one supported platform is present, then this platform is the
   // default platform. Otherwise returns an error.
-  static StatusOr<perftools::gputools::Platform*> GetSolePlatform();
+  static StatusOr<se::Platform*> GetSolePlatform();
 
   // Returns the platform according to the given name. Returns error if there is
   // no such platform.
-  static StatusOr<perftools::gputools::Platform*> GetPlatform(
-      const string& platform_name);
+  static StatusOr<se::Platform*> GetPlatform(const string& platform_name);
 
   // Returns exactly one platform that does not have given name. Returns error
   // if there is no such platform, or there are multiple such platforms.
-  static StatusOr<perftools::gputools::Platform*> GetPlatformExceptFor(
+  static StatusOr<se::Platform*> GetPlatformExceptFor(
       const string& platform_name);
 
   // Returns a vector of StreamExecutors for the given platform. The vector is
@@ -64,8 +62,8 @@ class PlatformUtil {
   // element is nullptr, then the device is present by not supported by XLA.
   //
   // If the platform has no visible devices, a not-found error is returned.
-  static StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
-  GetStreamExecutors(perftools::gputools::Platform* platform);
+  static StatusOr<std::vector<se::StreamExecutor*>> GetStreamExecutors(
+      se::Platform* platform);
 
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(PlatformUtil);
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 52500e4e79..2df59c3556 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -54,8 +54,6 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 using ::tensorflow::strings::Printf;
 using ::tensorflow::strings::StrCat;
 using ::xla::source_map_util::InvalidParameterArgument;
@@ -95,15 +93,12 @@ tensorflow::Status RecordResult(const ShapedBuffer& result,
 
 }  // namespace
 
-ServiceOptions& ServiceOptions::set_platform(
-    perftools::gputools::Platform* platform) {
+ServiceOptions& ServiceOptions::set_platform(se::Platform* platform) {
   platform_ = platform;
   return *this;
 }
 
-perftools::gputools::Platform* ServiceOptions::platform() const {
-  return platform_;
-}
+se::Platform* ServiceOptions::platform() const { return platform_; }
 
 ServiceOptions& ServiceOptions::set_number_of_replicas(int number_of_replicas) {
   number_of_replicas_ = number_of_replicas;
@@ -123,7 +118,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {
 }
 
 /* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   ServiceOptions default_options;
   default_options.set_platform(platform);
   return NewService(default_options);
@@ -131,7 +126,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {
 
 /* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
     const ServiceOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   std::unique_ptr<Backend> execute_backend;
   if (platform == nullptr) {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
@@ -235,8 +230,7 @@ tensorflow::Status Service::ValidateResultShapeWithLayout(
 StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
 Service::ResolveAndValidateArguments(
     tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-        stream_executors) {
+    tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors) {
   CHECK_EQ(options_.number_of_replicas(), stream_executors.size());
   std::vector<std::vector<const ShapedBuffer*>> replicated_arguments;
   replicated_arguments.resize(options_.number_of_replicas());
@@ -349,8 +343,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
 StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     std::vector<VersionedComputationHandle> versioned_handles,
     std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-    Backend* backend,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+    Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
     DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << Printf("BuildExecutable on service %p", this);
 
@@ -412,8 +405,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
 StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     const std::vector<const HloModuleProto*>& module_protos,
     std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-    Backend* backend,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+    Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
     DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << Printf("BuildExecutable on service %p", this);
 
@@ -493,7 +485,7 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
 StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
     const VersionedComputationHandle& versioned_handle,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-    perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+    se::StreamExecutor* executor, ExecutionProfile* profile,
     DeviceMemoryAllocator* device_allocator) {
   std::shared_ptr<Executable> executable =
       compilation_cache_.LookUp(versioned_handle, *module_config);
@@ -541,7 +533,7 @@ Service::ExecuteParallelAndRegisterResult(
   // Streams where the computation are launched, so we can wait on the streams
   // to complete.
   std::vector<Pool<se::Stream>::SmartPtr> streams;
-  std::vector<std::unique_ptr<perftools::gputools::Timer>> timers;
+  std::vector<std::unique_ptr<se::Timer>> timers;
 
   // Global data handles for the computation results, one for each computation.
   std::vector<GlobalDataHandle> result_handles;
@@ -565,8 +557,7 @@ Service::ExecuteParallelAndRegisterResult(
       streams.push_back(std::move(stream));
 
       if (replica == 0 && profile != nullptr) {
-        timers.emplace_back(
-            new perftools::gputools::Timer(streams.back()->parent()));
+        timers.emplace_back(new se::Timer(streams.back()->parent()));
         streams.back()
             ->InitTimer(timers.back().get())
             .ThenStartTimer(timers.back().get());
@@ -734,9 +725,9 @@ tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
   return computation->SetReturnValue(arg->operand());
 }
 
-StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
-Service::GetExecutors(const ExecutionOptions& execution_options,
-                      int64 requests_size, int64 request_index) const {
+StatusOr<std::vector<se::StreamExecutor*>> Service::GetExecutors(
+    const ExecutionOptions& execution_options, int64 requests_size,
+    int64 request_index) const {
   if (execution_options.device_handles().empty()) {
     return FailedPrecondition(
         "device handles must be given to execute parallel computations");
@@ -748,7 +739,7 @@ Service::GetExecutors(const ExecutionOptions& execution_options,
         "handles.",
         requests_size, request_index, execution_options.device_handles_size());
   }
-  std::vector<perftools::gputools::StreamExecutor*> executors;
+  std::vector<se::StreamExecutor*> executors;
   for (const auto& device_handle : execution_options.device_handles()) {
     TF_ASSIGN_OR_RETURN(auto replicas,
                         Replicas(*execute_backend_, device_handle));
@@ -780,7 +771,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
   VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
 
   std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
-  std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
+  std::vector<std::vector<se::StreamExecutor*>> all_executors;
   std::vector<VersionedComputationHandle> versioned_handles;
   std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
   std::vector<string> computation_names;
@@ -891,7 +882,7 @@ tensorflow::Status Service::ExecuteGraphParallel(
   VLOG(1) << "running execute-graph-parallel request";
 
   std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
-  std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
+  std::vector<std::vector<se::StreamExecutor*>> all_executors;
   std::vector<const HloModuleProto*> module_protos;
   std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
   std::vector<string> computation_names;
@@ -1953,9 +1944,9 @@ DeviceHandle Service::SingleComputationDeviceHandle() const {
   return device_handle;
 }
 
-StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Service::Replicas(
+StatusOr<std::vector<se::StreamExecutor*>> Service::Replicas(
     const Backend& backend, const DeviceHandle& device_handle) const {
-  std::vector<perftools::gputools::StreamExecutor*> replicas;
+  std::vector<se::StreamExecutor*> replicas;
   for (int replica = 0; replica < options_.number_of_replicas(); ++replica) {
     // From the computation placer, find out the device ids of the replicas for
     // the given device handle.
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index e399f1ac19..476bd0597d 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -53,8 +53,8 @@ namespace xla {
 class ServiceOptions {
  public:
   // Set the platform backing the service, or nullptr for the default platform.
-  ServiceOptions& set_platform(perftools::gputools::Platform* platform);
-  perftools::gputools::Platform* platform() const;
+  ServiceOptions& set_platform(se::Platform* platform);
+  se::Platform* platform() const;
 
   // Set the number of replicas to use when compiling replicated
   // programs.
@@ -66,7 +66,7 @@ class ServiceOptions {
   int intra_op_parallelism_threads() const;
 
  private:
-  perftools::gputools::Platform* platform_ = nullptr;
+  se::Platform* platform_ = nullptr;
   int number_of_replicas_ = 1;
   int intra_op_parallelism_threads_ = -1;
 };
@@ -79,7 +79,7 @@ class Service : public ServiceInterface {
  public:
   // Factory method for creating a new Service.
   static StatusOr<std::unique_ptr<Service>> NewService(
-      perftools::gputools::Platform* platform = nullptr);
+      se::Platform* platform = nullptr);
   static StatusOr<std::unique_ptr<Service>> NewService(
       const ServiceOptions& options);
 
@@ -286,7 +286,7 @@ class Service : public ServiceInterface {
                               ExecuteResponse* result);
 
   // Prepare the executors for executing parallel.
-  StatusOr<std::vector<perftools::gputools::StreamExecutor*>> GetExecutors(
+  StatusOr<std::vector<se::StreamExecutor*>> GetExecutors(
       const ExecutionOptions& execution_options, int64 requests_size,
       int64 request_index) const;
 
@@ -310,8 +310,7 @@ class Service : public ServiceInterface {
   StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
   ResolveAndValidateArguments(
       tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          stream_executors);
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
 
   // Create a Hlo module config for the given program shape and arguments.
   // execution_options is optional; if not given a default is used.
@@ -329,7 +328,7 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<Executable>> BuildExecutable(
       const VersionedComputationHandle& versioned_handle,
       std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-      perftools::gputools::StreamExecutor* executor,
+      se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Builds an Executable for the given HLO module proto.
@@ -338,7 +337,7 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<Executable>> BuildExecutable(
       const HloModuleProto& module_proto,
       std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-      perftools::gputools::StreamExecutor* executor,
+      se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Same as BuildExecutable() above, but builds a list of Executables for the
@@ -346,14 +345,12 @@ class Service : public ServiceInterface {
   StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
       std::vector<VersionedComputationHandle> versioned_handles,
       std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-      Backend* backend,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+      Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
       DeviceMemoryAllocator* device_allocator);
   StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
       const std::vector<const HloModuleProto*>& module_protos,
       std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-      Backend* backend,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+      Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
       DeviceMemoryAllocator* device_allocator);
 
   // Similar to BuildExecutable, but look in the compilation cache for the
@@ -362,7 +359,7 @@ class Service : public ServiceInterface {
   StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
       const VersionedComputationHandle& versioned_handle,
       std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-      perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+      se::StreamExecutor* executor, ExecutionProfile* profile,
       DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Runs the given executable with the given arguments and register the result
@@ -411,7 +408,7 @@ class Service : public ServiceInterface {
   // Returns the stream executors assigned to the replicas represented by the
   // given device handle. Each device_handle is a virtual replicated device that
   // represents a set of physical devices for the replicas.
-  StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Replicas(
+  StatusOr<std::vector<se::StreamExecutor*>> Replicas(
       const Backend& backend, const DeviceHandle& device_handle) const;
 
   Status MaybeDumpHloModule(const HloModule& module) const;
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index 6c1f8feac7..7f3910cdb0 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -28,7 +28,7 @@ namespace xla {
 class ServiceExecutableRunOptions {
  public:
   using StreamBorrower =
-      std::function<StatusOr<Pool<perftools::gputools::Stream>::SmartPtr>(int)>;
+      std::function<StatusOr<Pool<se::Stream>::SmartPtr>(int)>;
 
   ServiceExecutableRunOptions()
       : ServiceExecutableRunOptions(ExecutableRunOptions()) {}
@@ -45,14 +45,13 @@ class ServiceExecutableRunOptions {
   ExecutableRunOptions* mutable_run_options() { return &run_options_; }
 
   // Delegate to `ExecutableRunOptions` member.
-  perftools::gputools::Stream* stream() const { return run_options_.stream(); }
+  se::Stream* stream() const { return run_options_.stream(); }
   DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); }
   int device_ordinal() const { return run_options_.device_ordinal(); }
 
   // Borrows a stream and returns a smart pointer which returns the stream on
   // destruction.
-  StatusOr<Pool<perftools::gputools::Stream>::SmartPtr> BorrowStream(
-      int device_ordinal) const {
+  StatusOr<Pool<se::Stream>::SmartPtr> BorrowStream(int device_ordinal) const {
     return borrow_stream_
                ? borrow_stream_(device_ordinal)
                : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 6e9986165f..10a2aa2b30 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -28,8 +28,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 using ::tensorflow::strings::Appendf;
@@ -146,7 +144,7 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
 
 std::unique_ptr<ShapedBuffer> ScopedShapedBuffer::release() {
   auto shaped_buffer = MakeUnique<ShapedBuffer>(std::move(*this));
-  buffers_ = ShapeTree<perftools::gputools::DeviceMemoryBase>();
+  buffers_ = ShapeTree<se::DeviceMemoryBase>();
   return shaped_buffer;
 }
 
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index b816df8385..62ba8f2734 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -41,8 +41,7 @@ class ShapedBuffer {
   // determines the number of device allocations (DeviceMemoryBase) held by the
   // ShapedBuffer.
   ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
-               const perftools::gputools::Platform* platform,
-               int device_ordinal);
+               const se::Platform* platform, int device_ordinal);
 
   // Returns the shape of the on-host representation of the data held by this
   // ShapedBuffer.
@@ -52,35 +51,29 @@ class ShapedBuffer {
   // ShapedBuffer.
   const Shape& on_device_shape() const { return on_device_shape_; }
 
-  const perftools::gputools::Platform* platform() const { return platform_; }
+  const se::Platform* platform() const { return platform_; }
   int device_ordinal() const { return device_ordinal_; }
 
   // Return the root buffer of the shape (shape index {}).
-  const perftools::gputools::DeviceMemoryBase& root_buffer() const {
+  const se::DeviceMemoryBase& root_buffer() const {
     return buffer(/*index=*/{});
   }
 
   // Returns the buffer at the given shape index where index is defined as in
   // ShapeUtil::GetSubshape.
-  const perftools::gputools::DeviceMemoryBase& buffer(
-      const ShapeIndex& index) const {
+  const se::DeviceMemoryBase& buffer(const ShapeIndex& index) const {
     return buffers_.element(index);
   }
 
   // Sets the device memory buffer at the given index.
-  void set_buffer(const perftools::gputools::DeviceMemoryBase& buffer,
-                  const ShapeIndex& index) {
+  void set_buffer(const se::DeviceMemoryBase& buffer, const ShapeIndex& index) {
     *buffers_.mutable_element(index) = buffer;
   }
 
   // Returns the underlying ShapeTree containing all the device addresses in the
   // ShapedBuffer.
-  const ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() const {
-    return buffers_;
-  }
-  ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() {
-    return buffers_;
-  }
+  const ShapeTree<se::DeviceMemoryBase>& buffers() const { return buffers_; }
+  ShapeTree<se::DeviceMemoryBase>& buffers() { return buffers_; }
 
   // Set all device memory pointers in the object to null.
   void clear();
@@ -101,13 +94,13 @@ class ShapedBuffer {
   Shape on_device_shape_;
 
   // The platform the memory is allocated on.
-  const perftools::gputools::Platform* platform_;
+  const se::Platform* platform_;
 
   // The device the memory is allocated on.
   int device_ordinal_;
 
   // The tree of device buffers. Its shape is on_device_shape().
-  ShapeTree<perftools::gputools::DeviceMemoryBase> buffers_;
+  ShapeTree<se::DeviceMemoryBase> buffers_;
 };
 
 std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index 2f36e2b16e..be8231b73c 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -25,24 +25,20 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 /* static */ tensorflow::mutex
     TransferManager::platform_transfer_manager_mutex_(
         tensorflow::LINKER_INITIALIZED);
 
-/* static */ std::map<perftools::gputools::Platform::Id,
-                      TransferManager::State>*
+/* static */ std::map<se::Platform::Id, TransferManager::State>*
 TransferManager::GetPlatformTransferManagers() {
-  static auto* r =
-      new std::map<perftools::gputools::Platform::Id, TransferManager::State>;
+  static auto* r = new std::map<se::Platform::Id, TransferManager::State>;
   return r;
 }
 
 Status TransferManager::TransferArrayToDevice(
-    perftools::gputools::StreamExecutor* executor, const Literal& literal,
-    const perftools::gputools::DeviceMemoryBase& dest) {
+    se::StreamExecutor* executor, const Literal& literal,
+    const se::DeviceMemoryBase& dest) {
   const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
   TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape))
       << "On-device representation of "
@@ -61,8 +57,8 @@ Status TransferManager::TransferArrayToDevice(
 }
 
 StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
-    perftools::gputools::StreamExecutor* executor, const Shape& shape,
-    const perftools::gputools::DeviceMemoryBase& source) {
+    se::StreamExecutor* executor, const Shape& shape,
+    const se::DeviceMemoryBase& source) {
   TF_RET_CHECK(ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape))
       << "Shape " << ShapeUtil::HumanString(shape)
       << " has a differently shaped representation on-device: "
@@ -112,8 +108,7 @@ StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
 }
 
 Status TransferManager::WriteTupleIndexTables(
-    perftools::gputools::StreamExecutor* executor,
-    const ShapedBuffer& device_buffer) {
+    se::StreamExecutor* executor, const ShapedBuffer& device_buffer) {
   VLOG(2) << "Writing tuple index tables for " << device_buffer;
 
   TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 9f2b5c4aec..410d2af7af 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -42,7 +42,7 @@ class TransferManager {
   virtual ~TransferManager() {}
 
   // Returns the ID of the platform that this transfer manager acts on.
-  virtual perftools::gputools::Platform::Id PlatformId() const = 0;
+  virtual se::Platform::Id PlatformId() const = 0;
 
   // Returns the shape of the on-device representation for the given shape on
   // the host. This is intended for use with ShapedBuffer where buffers are
@@ -58,48 +58,45 @@ class TransferManager {
   // DeviceShape(literal_shape) must be compatible, but need not have the same
   // layout.
   virtual StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const ShapedBuffer& device_buffer) = 0;
+      se::StreamExecutor* executor, const ShapedBuffer& device_buffer) = 0;
 
   // Transfers the given literal into the previously allocated device memory
   // represented by the given ShapedBuffer using the given executor. The shape
   // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible,
   // but need not have the same layout
-  virtual Status TransferLiteralToDevice(
-      perftools::gputools::StreamExecutor* executor, const Literal& literal,
-      const ShapedBuffer& device_buffer) = 0;
+  virtual Status TransferLiteralToDevice(se::StreamExecutor* executor,
+                                         const Literal& literal,
+                                         const ShapedBuffer& device_buffer) = 0;
 
   // Convenience methods for transferring an array to or from the device at a
   // known address. This avoids having to construct a ShapedBuffer just to
   // transfer an array at a known address.
-  Status TransferArrayToDevice(
-      perftools::gputools::StreamExecutor* executor, const Literal& literal,
-      const perftools::gputools::DeviceMemoryBase& dest);
+  Status TransferArrayToDevice(se::StreamExecutor* executor,
+                               const Literal& literal,
+                               const se::DeviceMemoryBase& dest);
   StatusOr<std::unique_ptr<Literal>> TransferArrayFromDevice(
-      perftools::gputools::StreamExecutor* executor, const Shape& shape,
-      const perftools::gputools::DeviceMemoryBase& source);
+      se::StreamExecutor* executor, const Shape& shape,
+      const se::DeviceMemoryBase& source);
 
   // Transfers the given literal into the Infeed interface of the device,
   // using the given executor.
-  virtual Status TransferLiteralToInfeed(
-      perftools::gputools::StreamExecutor* executor,
-      const Literal& literal) = 0;
+  virtual Status TransferLiteralToInfeed(se::StreamExecutor* executor,
+                                         const Literal& literal) = 0;
 
   // Transfers the given literal from the Outfeed interface of the device,
   // using the given executor.
-  virtual Status TransferLiteralFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
-      Literal* literal) = 0;
+  virtual Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
+                                            const Shape& literal_shape,
+                                            Literal* literal) = 0;
 
   // Resets the devices associated with this transfer manager.
   virtual Status ResetDevices(
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          executor) = 0;
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> executor) = 0;
 
   // Given an allocated ShapedBuffer, constructs the tuple index table(s) in
   // each buffer of the given ShapedBuffer corresponding to tuple shapes. If the
   // ShapedBuffer is array-shaped this method does nothing.
-  Status WriteTupleIndexTables(perftools::gputools::StreamExecutor* executor,
+  Status WriteTupleIndexTables(se::StreamExecutor* executor,
                                const ShapedBuffer& device_buffer);
 
   // Determines the byte size requirement for the given shape on the underlying
@@ -127,13 +124,13 @@ class TransferManager {
   // Precondition: a platform kind must not be registered more than once.
   typedef std::unique_ptr<TransferManager> (*TransferManagerCreationFunction)();
   static void RegisterTransferManager(
-      perftools::gputools::Platform::Id platform_id,
+      se::Platform::Id platform_id,
       TransferManagerCreationFunction transfer_manager);
 
   // Returns the transfer manager singleton pointer if it is available for the
   // given platform, or an error status if it is not.
   static StatusOr<TransferManager*> GetForPlatform(
-      const perftools::gputools::Platform* platform);
+      const se::Platform* platform);
 
  protected:
   // Transfer a memory block of the given size from 'source' buffer to the
@@ -143,35 +140,32 @@ class TransferManager {
   //
   // source is the source data that must be in the target-dependent layout that
   // the Infeed HLO used in the computation expects.
-  virtual Status TransferBufferToInfeed(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source) = 0;
+  virtual Status TransferBufferToInfeed(se::StreamExecutor* executor,
+                                        int64 size, const void* source) = 0;
 
   // Transfer a memory block of the given size from the device source into the
   // 'destination' buffer.
   //
   // size is the size to transfer to destination in bytes.
-  virtual Status TransferBufferFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source, int64 size,
-      void* destination);
+  virtual Status TransferBufferFromDevice(se::StreamExecutor* executor,
+                                          const se::DeviceMemoryBase& source,
+                                          int64 size, void* destination);
 
   // Transfer a memory block of the given size from 'source' buffer to the given
   // destination of the device.
   //
   // size is the size to transfer from source in bytes.
-  virtual Status TransferBufferToDevice(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source, perftools::gputools::DeviceMemoryBase* destination);
+  virtual Status TransferBufferToDevice(se::StreamExecutor* executor,
+                                        int64 size, const void* source,
+                                        se::DeviceMemoryBase* destination);
 
   // Writes the given device-memory pointers in 'elements' to the given region
   // to construct a tuple index table in the platform-specific tuple
   // representation.
   virtual Status WriteSingleTupleIndexTable(
-      perftools::gputools::StreamExecutor* executor,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          elements,
-      const Shape& shape, perftools::gputools::DeviceMemoryBase* region) = 0;
+      se::StreamExecutor* executor,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
+      const Shape& shape, se::DeviceMemoryBase* region) = 0;
 
  private:
   // The mutex that guards the platform-to-transfer manager map.
@@ -186,8 +180,7 @@ class TransferManager {
   };
 
   // Map from platform kind to transfer manager singleton.
-  static std::map<perftools::gputools::Platform::Id, State>*
-  GetPlatformTransferManagers();
+  static std::map<se::Platform::Id, State>* GetPlatformTransferManagers();
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
index 777ac167a3..bff60f25ec 100644
--- a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
+++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
@@ -34,7 +34,7 @@ namespace {
 
 class BitcastConvertTest : public ClientLibraryTestBase {
  public:
-  explicit BitcastConvertTest(perftools::gputools::Platform* platform = nullptr)
+  explicit BitcastConvertTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 312d8f284d..69389dae3f 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -32,8 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
@@ -59,8 +57,7 @@ se::Platform* GetReferencePlatform() {
 }  // namespace
 
 ClientLibraryTestBase::ClientLibraryTestBase(
-    perftools::gputools::Platform* platform,
-    const LocalClientOptions& client_options)
+    se::Platform* platform, const LocalClientOptions& client_options)
     : client_(GetOrCreateLocalClientOrDie(client_options)),
       execution_options_(CreateDefaultExecutionOptions()) {
   CHECK_EQ(platform, client_options.platform());
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index b3212dd228..481d7c5c25 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -64,11 +64,10 @@ std::vector<TestCase> ExpandUseBfloat16(
 // A client library test establishes an in-process XLA client connection.
 class ClientLibraryTestBase : public ::testing::Test {
  protected:
-  explicit ClientLibraryTestBase(
-      perftools::gputools::Platform* platform = nullptr);
+  explicit ClientLibraryTestBase(se::Platform* platform = nullptr);
 
   // Creates a new ClientLibraryTestBase with custom client options.
-  ClientLibraryTestBase(perftools::gputools::Platform* platform,
+  ClientLibraryTestBase(se::Platform* platform,
                         const LocalClientOptions& client_options);
 
   // Returns the name of the test currently being run.
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index c15d808f1d..7ea82a791f 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -47,16 +47,14 @@ ClientType client_types[] = {ClientType::kLocal, ClientType::kCompileOnly};
 
 class ComputeConstantTest : public ::testing::Test {
  public:
-  explicit ComputeConstantTest(
-      perftools::gputools::Platform* platform = nullptr)
+  explicit ComputeConstantTest(se::Platform* platform = nullptr)
       : platform_(platform) {}
 
   string TestName() const {
     return ::testing::UnitTest::GetInstance()->current_test_info()->name();
   }
 
-  Client* ClientOrDie(::perftools::gputools::Platform* platform,
-                      ClientType client_type) {
+  Client* ClientOrDie(se::Platform* platform, ClientType client_type) {
     if (client_type == ClientType::kLocal) {
       StatusOr<Client*> result =
           ClientLibrary::GetOrCreateLocalClient(platform);
@@ -107,7 +105,7 @@ class ComputeConstantTest : public ::testing::Test {
     return result.ok() ? result.ValueOrDie() : false;
   }
 
-  perftools::gputools::Platform* platform_;
+  se::Platform* platform_;
 };
 
 TEST_F(ComputeConstantTest, ScalarInt32Literal) {
diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index 0842a8918b..e67a30d76c 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -36,7 +36,7 @@ namespace {
 
 class ConvertTest : public ClientLibraryTestBase {
  public:
-  explicit ConvertTest(perftools::gputools::Platform* platform = nullptr)
+  explicit ConvertTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 5f00c34002..464b8cbebb 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -35,8 +35,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index a292eab1d1..ed16963b40 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -50,8 +50,6 @@ limitations under the License.
 
 using tensorflow::gtl::ArraySlice;
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 21f71fc91b..c5afe0c3e0 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -35,8 +35,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 namespace {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 3e8e2360bb..28d7ab09cb 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -76,8 +76,7 @@ class HloTestBase : public ::testing::Test {
   // If your test doesn't use interpreter as the reference backend, you can use
   // this constructor. Note that your test target is responsible for linking in
   // both needed backends.
-  HloTestBase(::perftools::gputools::Platform* test_platform,
-              ::perftools::gputools::Platform* reference_platform);
+  HloTestBase(se::Platform* test_platform, se::Platform* reference_platform);
 
   ~HloTestBase() override {}
 
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index 7e92439c49..2f46ee0be2 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -43,7 +43,7 @@ class LLVMCompilerTest : public ::testing::Test {
   ~LLVMCompilerTest() override {}
 
  protected:
-  using Platform = ::perftools::gputools::Platform;
+  using Platform = se::Platform;
 
   explicit LLVMCompilerTest(string platform_name)
       : platform_name_(std::move(platform_name)) {}
@@ -95,7 +95,7 @@ class LLVMCompilerTest : public ::testing::Test {
     modules.push_back(hlo_module->Clone());
     modules.push_back(std::move(hlo_module));
 
-    std::vector<std::vector<perftools::gputools::StreamExecutor *>> executors;
+    std::vector<std::vector<se::StreamExecutor *>> executors;
     executors.push_back({backend_->default_stream_executor()});
     executors.push_back({backend_->default_stream_executor()});
 
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 2462ea39f9..373dd3c5df 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -43,8 +43,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 96b976d25d..29fd985acf 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -35,8 +35,9 @@ namespace xla {
 
 /* static */ TestAllocator* LocalClientTestBase::allocator_;
 
-StatusOr<perftools::gputools::DeviceMemoryBase> TestAllocator::Allocate(
-    int device_ordinal, uint64 size, bool retry_on_failure) {
+StatusOr<se::DeviceMemoryBase> TestAllocator::Allocate(int device_ordinal,
+                                                       uint64 size,
+                                                       bool retry_on_failure) {
   VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")";
   {
     tensorflow::mutex_lock lock(count_mutex_);
@@ -46,8 +47,8 @@ StatusOr<perftools::gputools::DeviceMemoryBase> TestAllocator::Allocate(
   return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size);
 }
 
-tensorflow::Status TestAllocator::Deallocate(
-    int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) {
+tensorflow::Status TestAllocator::Deallocate(int device_ordinal,
+                                             se::DeviceMemoryBase* mem) {
   VLOG(2) << "Deallocate(" << device_ordinal << ")";
   {
     tensorflow::mutex_lock lock(count_mutex_);
@@ -88,7 +89,7 @@ int64 TestAllocator::deallocation_count(int device_ordinal) const {
 }
 
 /* static */ TestAllocator* LocalClientTestBase::GetOrCreateAllocator(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
   tensorflow::mutex_lock lock(mu);
 
@@ -115,8 +116,7 @@ struct LocalClientTestBase::EigenThreadPoolWrapper {
   std::unique_ptr<Eigen::ThreadPoolDevice> device;
 };
 
-LocalClientTestBase::LocalClientTestBase(
-    perftools::gputools::Platform* platform)
+LocalClientTestBase::LocalClientTestBase(se::Platform* platform)
     : local_client_(
           ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()),
       thread_pool_wrapper_(new EigenThreadPoolWrapper()) {
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index f0c73f04f6..7555d5e893 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -41,15 +41,15 @@ namespace xla {
 
 class TestAllocator : public StreamExecutorMemoryAllocator {
  public:
-  explicit TestAllocator(perftools::gputools::Platform* platform)
+  explicit TestAllocator(se::Platform* platform)
       : StreamExecutorMemoryAllocator(
             platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) {
   }
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure) override;
-  tensorflow::Status Deallocate(
-      int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override;
+  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
+                                          bool retry_on_failure) override;
+  tensorflow::Status Deallocate(int device_ordinal,
+                                se::DeviceMemoryBase* mem) override;
 
   // Return the number of allocations that have been performed.
   int64 allocation_count() const;
@@ -75,12 +75,10 @@ class TestAllocator : public StreamExecutorMemoryAllocator {
 class LocalClientTestBase : public ::testing::Test {
  protected:
   struct EigenThreadPoolWrapper;
-  explicit LocalClientTestBase(
-      perftools::gputools::Platform* platform = nullptr);
+  explicit LocalClientTestBase(se::Platform* platform = nullptr);
   virtual ~LocalClientTestBase();
 
-  static TestAllocator* GetOrCreateAllocator(
-      perftools::gputools::Platform* platform);
+  static TestAllocator* GetOrCreateAllocator(se::Platform* platform);
 
   // Copy the given literal onto the default device and return a
   // ScopedShapedBuffer. Convenience wrapper around
@@ -128,7 +126,7 @@ class LocalClientTestBase : public ::testing::Test {
   // of the process. So make the allocator static.
   static TestAllocator* allocator_;
 
-  perftools::gputools::StreamExecutor* stream_executor_;
+  se::StreamExecutor* stream_executor_;
   TransferManager* transfer_manager_;
 
   LocalClient* local_client_;
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index efe6cc6787..8fabcaca1b 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -41,7 +41,7 @@ namespace {
 
 class MapTest : public ClientLibraryTestBase {
  public:
-  explicit MapTest(perftools::gputools::Platform* platform = nullptr)
+  explicit MapTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index cda1989fad..997a1d8273 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -339,8 +339,8 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
   return std::move(arguments);
 }
 
-Status VerifyHloModule(const perftools::gputools::Platform& platform,
-                       HloModule* const module, bool allow_mixed_precision) {
+Status VerifyHloModule(const se::Platform& platform, HloModule* const module,
+                       bool allow_mixed_precision) {
   return HloVerifier(allow_mixed_precision).Run(module).status();
 }
 
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index b5ab779574..30c147910c 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -68,8 +68,7 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
 
 // Check that a given module satisfies various constraints before trying to
 // execute it.
-Status VerifyHloModule(const perftools::gputools::Platform& platform,
-                       HloModule* const module,
+Status VerifyHloModule(const se::Platform& platform, HloModule* const module,
                        bool allow_mixed_precision = false);
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
index b52c718814..697d78fe6e 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@@ -39,7 +39,7 @@ namespace {
 
 class VecOpsSimpleTest : public ClientLibraryTestBase {
  public:
-  explicit VecOpsSimpleTest(perftools::gputools::Platform* platform = nullptr)
+  explicit VecOpsSimpleTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 89ce2ce797..1e18b56799 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -37,8 +37,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index ff3418a128..efb00d56c5 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -34,7 +34,7 @@ limitations under the License.
 
 namespace xla {
 namespace {
-namespace se = ::perftools::gputools;
+
 namespace gtl = ::tensorflow::gtl;
 
 class HloProfileTest : public ClientLibraryTestBase {};
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index 20f3f1b957..b645acb700 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -49,9 +49,7 @@ using ::Eigen::half;
 // Alias namespace ::stream_executor as ::xla::se.
 namespace stream_executor {}
 namespace xla {
-// TODO(b/77980417): Uncomment this once all namespace aliases named 'se' are
-// removed in ::xla.
-// namespace se = ::stream_executor;
+namespace se = ::stream_executor;
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TYPES_H_
-- 
GitLab


From c72e6858b48d9104b718d4320454d47fde8fff4e Mon Sep 17 00:00:00 2001
From: Noah Eisen <ncteisen@google.com>
Date: Tue, 17 Apr 2018 23:06:57 -0700
Subject: [PATCH 062/450] No public changes.

PiperOrigin-RevId: 193309262
---
 tensorflow/core/distributed_runtime/rpc/grpc_util.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
index ece56a2727..e211c33732 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
@@ -21,8 +21,8 @@ namespace tensorflow {
 ::grpc::Status GrpcMaybeUnparseProto(const protobuf::Message& src,
                                      grpc::ByteBuffer* dst) {
   bool own_buffer;
-  return ::grpc::GenericSerialize<::grpc::ProtoBufferWriter, protobuf::Message>(
-      src, dst, &own_buffer);
+  return ::grpc::GenericSerialize<::grpc::ProtoBufferWriter,
+                                  protobuf::Message>(src, dst, &own_buffer);
 }
 
 // GrpcMaybeUnparseProto from a string simply copies the string to the
@@ -35,7 +35,7 @@ namespace tensorflow {
 }
 
 bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, protobuf::Message* dst) {
-  grpc::ProtoBufferReader reader(src);
+  ::grpc::ProtoBufferReader reader(src);
   return dst->ParseFromZeroCopyStream(&reader);
 }
 
-- 
GitLab


From 069756ce00faf1d1d34ccfdd45163d9a9af6c61b Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 18 Apr 2018 00:18:50 -0700
Subject: [PATCH 063/450] Enable the n=1 special case in the DeserializeSparse
 op.

The optimized case was previously dead because of two off-by-one errors (mea culpa).

PiperOrigin-RevId: 193314065
---
 tensorflow/core/kernels/serialize_sparse_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/serialize_sparse_op.cc b/tensorflow/core/kernels/serialize_sparse_op.cc
index 64e0a68c2c..9e041d98f7 100644
--- a/tensorflow/core/kernels/serialize_sparse_op.cc
+++ b/tensorflow/core/kernels/serialize_sparse_op.cc
@@ -340,7 +340,7 @@ class DeserializeSparseOp : public OpKernel {
             "but has a zero dimension ",
             serialized_sparse.shape().DebugString()));
 
-    if (num_sparse_tensors == 0 && serialized_sparse.shape().dims() == 1) {
+    if (num_sparse_tensors == 1 && serialized_sparse.shape().dims() == 1) {
       // Special case with a single sparse tensor. We can avoid data
       // motion in the Concat and Reshape.
       const auto& serialized_sparse_t = serialized_sparse.vec<T>();
-- 
GitLab


From cf836c4f6b3067bda14dd0ee8455d99c19423d5a Mon Sep 17 00:00:00 2001
From: Junpeng Lao <junpeng.lao@unifr.ch>
Date: Wed, 18 Apr 2018 10:45:49 +0200
Subject: [PATCH 064/450] Add test

---
 .../python/kernel_tests/bijectors/ordered_test.py  | 14 +++++++-------
 .../distributions/python/ops/bijectors/ordered.py  |  8 +++++---
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
index 1bcbfed6c3..2d49b4294e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
@@ -38,12 +38,12 @@ class OrderedBijectorTest(test.TestCase):
     with self.test_session():
       ordered = Ordered()
       self.assertEqual("ordered", ordered.name)
-      x = np.log([[2., 3, 4], [4., 8, 12]])
-      y = [[0.2, 0.3, 0.4, 0.1], [0.16, 0.32, 0.48, 0.04]]
+      x = np.asarray([[2., 3, 4], [4., 8, 13]])
+      y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]
       self.assertAllClose(y, ordered.forward(x).eval())
       self.assertAllClose(x, ordered.inverse(y).eval())
       self.assertAllClose(
-          -np.sum(np.log(y), axis=1),
+          -np.sum(y[..., 1:], axis=-1),
           ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(),
           atol=0.,
           rtol=1e-7)
@@ -58,15 +58,15 @@ class OrderedBijectorTest(test.TestCase):
       ordered = Ordered()
       self.assertEqual("ordered", ordered.name)
       x = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
-      real_x = np.log([[2., 3, 4], [4., 8, 12]])
+      real_x = np.asarray([[2., 3, 4], [4., 8, 13]])
       y = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
-      real_y = [[0.2, 0.3, 0.4, 0.1], [0.16, 0.32, 0.48, 0.04]]
+      real_y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]
       self.assertAllClose(real_y, ordered.forward(x).eval(
           feed_dict={x: real_x}))
       self.assertAllClose(real_x, ordered.inverse(y).eval(
           feed_dict={y: real_y}))
       self.assertAllClose(
-          -np.sum(np.log(real_y), axis=1),
+          -np.sum(y[..., 1:], axis=-1),
           ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(
               feed_dict={y: real_y}),
           atol=0.,
@@ -82,7 +82,7 @@ class OrderedBijectorTest(test.TestCase):
   def testShapeGetters(self):
     with self.test_session():
       x = tensor_shape.TensorShape([4])
-      y = tensor_shape.TensorShape([5])
+      y = tensor_shape.TensorShape([4])
       bijector = Ordered(validate_args=True)
       self.assertAllEqual(y, bijector.forward_event_shape(x))
       self.assertAllEqual(y.as_list(),
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
index ec8f660144..64cf2e6b56 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
@@ -37,6 +37,9 @@ class Ordered(bijector.Bijector):
   """Bijector which maps a tensor x_k that has increasing elements in the last
   dimension to an unconstrained tensor y_k.
 
+  The inverse of the bijector applied to a normal random vector `X ~ N(0, 1)`
+  gives back a sorted random vector with the same distribution `Y ~ N(0, 1)`
+
   On the last dimension of the tensor, Ordered bijector performs:
   `y[0] = x[0]`
   `y[1:] = math_ops.log(x[1:] - x[:-1])`
@@ -79,7 +82,6 @@ class Ordered(bijector.Bijector):
 
   def _inverse_event_shape_tensor(self, output_shape):
     if self.validate_args:
-      # It is not possible for a negative shape so we need only check <= 1.
       is_greater_one = check_ops.assert_greater(
           output_shape[-1], 1, message="Need last dimension greater than 1.")
       output_shape = control_flow_ops.with_dependencies(
@@ -108,7 +110,7 @@ class Ordered(bijector.Bijector):
   def _maybe_assert_valid_x(self, x):
     if not self.validate_args:
       return x
-    is_valid = check_ops.is_strictly_increasing(
-        x,
+    is_valid = check_ops.assert_positive(
+        x[..., 1:] - x[..., :-1],
         message="Forward transformation input must be strictly increasing.")
     return control_flow_ops.with_dependencies([is_valid], x)
\ No newline at end of file
-- 
GitLab


From 5c52028c7337baafd8d92d36a29e0fa088393d06 Mon Sep 17 00:00:00 2001
From: Junpeng Lao <junpeng.lao@unifr.ch>
Date: Wed, 18 Apr 2018 12:38:41 +0200
Subject: [PATCH 065/450] add forward logdet jacobian

---
 .../kernel_tests/bijectors/ordered_test.py     | 14 ++++----------
 .../python/ops/bijectors/ordered.py            | 18 ++++++++++--------
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
index 2d49b4294e..63c8f1fb31 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
@@ -43,7 +43,7 @@ class OrderedBijectorTest(test.TestCase):
       self.assertAllClose(y, ordered.forward(x).eval())
       self.assertAllClose(x, ordered.inverse(y).eval())
       self.assertAllClose(
-          -np.sum(y[..., 1:], axis=-1),
+          np.sum(np.asarray(y)[..., 1:], axis=-1),
           ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(),
           atol=0.,
           rtol=1e-7)
@@ -66,7 +66,7 @@ class OrderedBijectorTest(test.TestCase):
       self.assertAllClose(real_x, ordered.inverse(y).eval(
           feed_dict={y: real_y}))
       self.assertAllClose(
-          -np.sum(y[..., 1:], axis=-1),
+          np.sum(np.asarray(real_y)[..., 1:], axis=-1),
           ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(
               feed_dict={y: real_y}),
           atol=0.,
@@ -96,14 +96,8 @@ class OrderedBijectorTest(test.TestCase):
   def testBijectiveAndFinite(self):
     with self.test_session():
       ordered = Ordered()
-      x = np.linspace(-50, 50, num=10).reshape(5, 2).astype(np.float32)
-      # Make y values on the simplex with a wide range.
-      y_0 = np.ones(5).astype(np.float32)
-      y_1 = (1e-5 * rng.rand(5)).astype(np.float32)
-      y_2 = (1e1 * rng.rand(5)).astype(np.float32)
-      y = np.array([y_0, y_1, y_2])
-      y /= y.sum(axis=0)
-      y = y.T  # y.shape = [5, 3]
+      x = np.sort(rng.randn(3, 10), axis=-1).astype(np.float32)
+      y = (rng.randn(3, 10)).astype(np.float32)
       assert_bijective_and_finite(ordered, x, y, event_ndims=1)
 
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
index 64cf2e6b56..b2959cce31 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
@@ -24,7 +24,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops.distributions import bijector
 
 
@@ -37,8 +36,9 @@ class Ordered(bijector.Bijector):
   """Bijector which maps a tensor x_k that has increasing elements in the last
   dimension to an unconstrained tensor y_k.
 
-  The inverse of the bijector applied to a normal random vector `X ~ N(0, 1)`
-  gives back a sorted random vector with the same distribution `Y ~ N(0, 1)`
+  The inverse of the bijector applied to a normal random vector `y ~ N(0, 1)`
+  gives back a sorted random vector with the same distribution `x ~ N(0, 1)`
+  where `x = sort(y)`
 
   On the last dimension of the tensor, Ordered bijector performs:
   `y[0] = x[0]`
@@ -47,11 +47,11 @@ class Ordered(bijector.Bijector):
   Example Use:
 
   ```python
-  bijector.Ordered().forward(tf.log([2, 3, 4]))
-  # Result: [0.6931472, 3.6931472, 7.693147]
+  bijector.Ordered().forward([2, 3, 4])
+  # Result: [2., 0., 0.]
 
-  bijector.Ordered().inverse([0.2, 0.3, 0.4])
-  # Result: tf.log([2, 3, 4])
+  bijector.Ordered().inverse([0.06428002, -1.07774478, -0.71530371])
+  # Result: [0.06428002, 0.40464228, 0.8936858]
   ```
   """
 
@@ -105,7 +105,9 @@ class Ordered(bijector.Bijector):
     return math_ops.reduce_sum(y[..., 1:], axis=-1)
 
   def _forward_log_det_jacobian(self, x):
-    pass
+    return -math_ops.reduce_sum(
+      math_ops.log(x[..., 1:] - x[..., :-1]),
+      axis=-1)
 
   def _maybe_assert_valid_x(self, x):
     if not self.validate_args:
-- 
GitLab


From c8e118877cb9e6d201a64f5627de72877bcb8da6 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 18 Apr 2018 08:10:05 -0700
Subject: [PATCH 066/450] Fix bug in importing while loops within a while loop.

PiperOrigin-RevId: 193358699
---
 .../python/framework/meta_graph_test.py       | 25 +++++++++++++++++++
 tensorflow/python/ops/control_flow_ops.py     | 10 +++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index 5d5fb037fc..e5b157648e 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -522,6 +522,31 @@ class ScopedMetaGraphTest(test.TestCase):
         actual_grad_value = sess.run(grad)
         self.assertEqual(expected_grad_value, actual_grad_value)
 
+  def testImportWhileLoopInWhileLoop(self):
+    # Create a simple while loop.
+    with ops.Graph().as_default():
+      var = variables.Variable(0.0)
+      _, output = control_flow_ops.while_loop(lambda i, x: i < 5,
+                                              lambda i, x: (i + 1, x * 2.0),
+                                              [0, var])
+      output_name = output.name
+
+      # Generate a MetaGraphDef containing the while loop (no export scope).
+      meta_graph_def, _ = meta_graph.export_scoped_meta_graph()
+
+    # Restore the MetaGraphDef in a while loop in a new graph.
+    with ops.Graph().as_default():
+
+      def body(i, _):
+        meta_graph.import_scoped_meta_graph(meta_graph_def)
+        return i + 1, ops.get_default_graph().get_tensor_by_name(output_name)
+
+      _, x = control_flow_ops.while_loop(lambda i, x: i < 2, body, [0, 0.0],
+                                         name="")
+      with session.Session() as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(x)
+
   def testScopedImportUnderNameScope(self):
     graph = ops.Graph()
     with graph.as_default():
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index fb53d9ffea..c43bbd4a1e 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -2379,7 +2379,15 @@ class WhileContext(ControlFlowContext):
   def AddValue(self, val):
     """Add `val` to the current context and its outer context recursively."""
     result = val
-    if val.name not in self._values:
+    new_value = val.name not in self._values
+    # Don't treat ops in this context as new values. Usually all known values
+    # are in self._values, except when we're importing a while loop inside this
+    # WhileContext. Since there's a cycle in this case, `val` may be part of the
+    # imported while loop but not yet processed by this context and added to
+    # self._values in _AddOpInternal. We only want to process external input
+    # tensors to the while loop here.
+    new_value &= val.op._control_flow_context is not self  # pylint: disable=protected-access
+    if new_value:
       self._values.add(val.name)
 
       # If we are in a grad context and val is from its forward context,
-- 
GitLab


From 39047daafcf12864606a2c7e349eacee7f3771b1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 08:40:40 -0700
Subject: [PATCH 067/450] Allow default min/max ranges for int16 data types.

PiperOrigin-RevId: 193362891
---
 tensorflow/contrib/lite/toco/BUILD            |  1 +
 tensorflow/contrib/lite/toco/args.h           |  2 +
 .../graph_transformations.h                   | 18 ++++
 .../propagate_default_min_max.cc              | 86 +++++++++++++++++++
 .../contrib/lite/toco/toco_cmdline_flags.cc   | 16 +++-
 tensorflow/contrib/lite/toco/toco_flags.proto |  8 +-
 tensorflow/contrib/lite/toco/toco_tooling.cc  | 35 ++++++--
 tensorflow/contrib/lite/toco/tooling_util.cc  | 22 -----
 tensorflow/contrib/lite/toco/tooling_util.h   |  2 -
 9 files changed, 155 insertions(+), 35 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 398978b145..f696f4b845 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -238,6 +238,7 @@ cc_library(
         "graph_transformations/merge_reshape_into_preceding_transpose.cc",
         "graph_transformations/propagate_activation_function_into_constants.cc",
         "graph_transformations/propagate_array_data_types.cc",
+        "graph_transformations/propagate_default_min_max.cc",
         "graph_transformations/propagate_fake_quant_num_bits.cc",
         "graph_transformations/propagate_fixed_sizes.cc",
         "graph_transformations/quantization_util.cc",
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 71e7318ac3..c9662d05ce 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -227,6 +227,8 @@ struct ParsedTocoFlags {
   // TODO(aselle): command_line_flags  doesn't support doubles
   Arg<float> default_ranges_min = Arg<float>(0.);
   Arg<float> default_ranges_max = Arg<float>(0.);
+  Arg<float> default_int16_ranges_min = Arg<float>(0.);
+  Arg<float> default_int16_ranges_max = Arg<float>(0.);
   Arg<string> inference_type;
   Arg<string> inference_input_type;
   Arg<bool> drop_fake_quant = Arg<bool>(false);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 56b3dec5c4..8075d0205d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -190,6 +190,24 @@ DECLARE_GRAPH_TRANSFORMATION(Dequantize)
 DECLARE_GRAPH_TRANSFORMATION(UnpartitionEmbeddingLookup)
 DECLARE_GRAPH_TRANSFORMATION(ExperimentalShuffleFCWeights)
 
+class PropagateDefaultMinMax : public GraphTransformation {
+ public:
+  bool Run(Model* model, std::size_t op_index) override;
+  const char* Name() const override { return "PropagateDefaultMinMax"; }
+
+  bool has_any_ranges_defined() const { return !type_ranges_.empty(); }
+  void DefineTypeRange(ArrayDataType data_type, double min, double max) {
+    MinMax minmax;
+    minmax.min = min;
+    minmax.max = max;
+    type_ranges_.emplace_back(data_type, minmax);
+  }
+
+ private:
+  bool SetArrayMinMax(const string& array_name, Array* array);
+  std::vector<std::pair<ArrayDataType, MinMax>> type_ranges_;
+};
+
 class ResolveReshapeAttributes : public GraphTransformation {
  public:
   bool Run(Model* model, std::size_t op_index) override;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc
new file mode 100644
index 0000000000..50b90e7c2b
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Propagates default min/max values to any operator input/output array that
+// is missing them and has no buffer.
+//
+// Defaults are registered per quantized data type via DefineTypeRange(). An
+// array whose quantized data type has no registered range is left unchanged;
+// ranges are never rescaled from one data type to another.
+bool PropagateDefaultMinMax::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  const auto* op = it->get();
+
+  bool did_change = false;
+
+  for (const auto& input : op->inputs) {
+    auto& input_array = model->GetArray(input);
+    if (!input_array.minmax && !input_array.buffer) {
+      did_change |= SetArrayMinMax(input, &input_array);
+    }
+  }
+
+  for (const auto& output : op->outputs) {
+    auto& output_array = model->GetArray(output);
+    if (!output_array.minmax && !output_array.buffer) {
+      did_change |= SetArrayMinMax(output, &output_array);
+    }
+  }
+
+  return did_change;
+}
+
+// Sets the min/max on the given array from the range registered for its
+// quantized data type. Returns false (array untouched) if none is registered.
+bool PropagateDefaultMinMax::SetArrayMinMax(const string& array_name,
+                                            Array* array) {
+  CHECK(!array->minmax);
+
+  ArrayDataType quantized_data_type =
+      GetQuantizedDataType(*array, ArrayDataType::kUint8);
+  for (const auto& type_range : type_ranges_) {
+    if (type_range.first == quantized_data_type) {
+      array->GetOrCreateMinMax() = type_range.second;
+      break;
+    }
+  }
+  if (!array->minmax) {
+    AddMessageF(
+        "No defaults specified for quantized data type %s of array %s, "
+        "skipping",
+        ArrayDataTypeName(quantized_data_type), array_name);
+    return false;
+  }
+
+  AddMessageF("Adding default minmax %g,%g to array %s when quantized as %s",
+              array->GetMinMax().min, array->GetMinMax().max, array_name,
+              ArrayDataTypeName(quantized_data_type));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index d1d68b6b47..74f98c8452 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -61,11 +61,21 @@ bool ParseTocoFlagsFromCommandLineFlags(
       Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
            parsed_flags.default_ranges_min.default_value(),
            "If defined, will be used as the default value for the min bound "
-           "of min/max ranges used for quantization."),
+           "of min/max ranges used for quantization of uint8 arrays."),
       Flag("default_ranges_max", parsed_flags.default_ranges_max.bind(),
            parsed_flags.default_ranges_max.default_value(),
            "If defined, will be used as the default value for the max bound "
-           "of min/max ranges used for quantization."),
+           "of min/max ranges used for quantization of uint8 arrays."),
+      Flag("default_int16_ranges_min",
+           parsed_flags.default_int16_ranges_min.bind(),
+           parsed_flags.default_int16_ranges_min.default_value(),
+           "If defined, will be used as the default value for the min bound "
+           "of min/max ranges used for quantization of int16 arrays."),
+      Flag("default_int16_ranges_max",
+           parsed_flags.default_int16_ranges_max.bind(),
+           parsed_flags.default_int16_ranges_max.default_value(),
+           "If defined, will be used as the default value for the max bound "
+           "of min/max ranges used for quantization of int16 arrays."),
       Flag("inference_type", parsed_flags.inference_type.bind(),
            parsed_flags.inference_type.default_value(),
            "Target data type of arrays in the output file (for input_arrays, "
@@ -212,6 +222,8 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
   PARSE_TOCO_FLAG(IODataType, inference_input_type, FlagRequirement::kNone);
   READ_TOCO_FLAG(default_ranges_min, FlagRequirement::kNone);
   READ_TOCO_FLAG(default_ranges_max, FlagRequirement::kNone);
+  READ_TOCO_FLAG(default_int16_ranges_min, FlagRequirement::kNone);
+  READ_TOCO_FLAG(default_int16_ranges_max, FlagRequirement::kNone);
   READ_TOCO_FLAG(drop_fake_quant, FlagRequirement::kNone);
   READ_TOCO_FLAG(reorder_across_fake_quant, FlagRequirement::kNone);
   READ_TOCO_FLAG(allow_custom_ops, FlagRequirement::kNone);
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index 751aca948c..869c512d93 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -37,7 +37,7 @@ enum FileFormat {
 // of as properties of models, instead describing how models are to be
 // processed in the context of the present tooling job.
 //
-// Next ID to use: 15.
+// Next ID to use: 17.
 message TocoFlags {
   // Input file format
   optional FileFormat input_format = 1;
@@ -103,8 +103,14 @@ message TocoFlags {
   // for experimentation purposes only and should not be used in production:
   // they make it easy to quantize models, but the resulting quantized model
   // will be inaccurate.
+  //
+  // These values only apply to arrays quantized with the kUint8 data type.
   optional float default_ranges_min = 5;
   optional float default_ranges_max = 6;
+  // Equivalent versions of default_ranges_min/_max for arrays quantized with
+  // the kInt16 data type.
+  optional float default_int16_ranges_min = 15;
+  optional float default_int16_ranges_max = 16;
 
   // Ignore and discard FakeQuant nodes. For instance, that can be used to
   // generate plain float code without fake-quantization from a quantized
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index b69852453c..89cb2f85f8 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <set>
 
+#include "absl/memory/memory.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/contrib/lite/toco/allocate_transient_arrays.h"
 #include "tensorflow/contrib/lite/toco/dump_graphviz.h"
@@ -270,10 +271,6 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   RunGraphTransformations(model, "general graph transformations",
                           transformations);
 
-  // Fix any issues with IO edges. This must happen after any transform that
-  // may modify the structure of the edges.
-  FixEdgeArrays(model);
-
   if (quantize_output) {
     if (toco_flags.propagate_fake_quant_num_bits()) {
       RunGraphTransformations(model,
@@ -287,16 +284,38 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
                             });
   }
 
+  // Fix any issues with IO edges. This must happen after any transform that
+  // may modify the structure of the edges.
+  FixEdgeArrays(model);
+
   if (quantize_output) {
+    // If the user specified default min/max ranges we need to set all arrays
+    // that didn't either have a min/max specified or get one set via
+    // HardcodeMinMax or PropagateFakeQuantNumBits. This may require running
+    // HardcodeMinMax to move changes through the graph as we make changes.
+    auto propagate_default_min_max =
+        absl::make_unique<PropagateDefaultMinMax>();
     if (toco_flags.has_default_ranges_min() &&
         toco_flags.has_default_ranges_max()) {
-      UseDefaultMinMaxRangeValues(model, toco_flags.default_ranges_min(),
-                                  toco_flags.default_ranges_max());
-      // The new MinMax info may need to be propagated a bit.
+      propagate_default_min_max->DefineTypeRange(
+          ArrayDataType::kUint8, toco_flags.default_ranges_min(),
+          toco_flags.default_ranges_max());
+    }
+    if (toco_flags.has_default_int16_ranges_min() &&
+        toco_flags.has_default_int16_ranges_max()) {
+      propagate_default_min_max->DefineTypeRange(
+          ArrayDataType::kInt16, toco_flags.default_int16_ranges_min(),
+          toco_flags.default_int16_ranges_max());
+    }
+    if (propagate_default_min_max->has_any_ranges_defined()) {
       RunGraphTransformations(
           model, "default min-max range propagation graph transformations",
-          {new HardcodeMinMax});
+          {
+              propagate_default_min_max.release(),
+              new HardcodeMinMax,
+          });
     }
+
     CheckIsReadyForQuantization(*model);
     RunGraphTransformations(model, "quantization graph transformations",
                             {
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index ecac0c28a5..cf2cbeedc7 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1474,28 +1474,6 @@ void CheckIsReadyForQuantization(const Model& model) {
   }
 }
 
-void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min,
-                                 double default_ranges_max) {
-  for (const auto& op : model->operators) {
-    for (const auto& input : op->inputs) {
-      auto& input_array = model->GetArray(input);
-      if (!input_array.minmax && !input_array.buffer) {
-        auto& minmax = input_array.GetOrCreateMinMax();
-        minmax.min = default_ranges_min;
-        minmax.max = default_ranges_max;
-      }
-    }
-    for (const auto& output : op->outputs) {
-      auto& output_array = model->GetArray(output);
-      if (!output_array.minmax && !output_array.buffer) {
-        auto& minmax = output_array.GetOrCreateMinMax();
-        minmax.min = default_ranges_min;
-        minmax.max = default_ranges_max;
-      }
-    }
-  }
-}
-
 int ElementSize(ArrayDataType data_type) {
   switch (data_type) {
     case ArrayDataType::kBool:
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 4c705f4e5f..5cc15fa57b 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -188,8 +188,6 @@ T ConvertOperator(Operator* o, OperatorType type) {
 }
 
 void CheckIsReadyForQuantization(const Model& model);
-void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min,
-                                 double default_ranges_max);
 
 bool ReshapeIsEquivalentToTranspose(const Model& model,
                                     const TensorFlowReshapeOperator* op,
-- 
GitLab


From 5d8f98cdf0e4919e8558d661517c49960090a575 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Wed, 18 Apr 2018 23:49:52 +0800
Subject: [PATCH 068/450] Fix incorrect format in community/documentation.md

---
 .../docs_src/community/documentation.md       | 52 +++++++++----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index d5bc7a5a7a..8639656d07 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -402,24 +402,24 @@ types and default values.
 
 For example:
 
-    ```c++
-    REGISTER_OP("PngDecode")
-      .Input("contents: string")
-      .Attr("channels: int = 0")
-      .Output("image: uint8")
-      .Doc(R"doc(
-    Decodes the contents of a PNG file into a uint8 tensor.
-
-    contents: PNG file contents.
-    channels: Number of color channels, or 0 to autodetect based on the input.
-      Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.
-      If the input has a different number of channels, it will be transformed
-      accordingly.
-    image:= A 3-D uint8 tensor of shape `[height, width, channels]`.
-      If `channels` is 0, the last dimension is determined
-      from the png contents.
-    )doc");
-    ```
+```c++
+REGISTER_OP("PngDecode")
+  .Input("contents: string")
+  .Attr("channels: int = 0")
+  .Output("image: uint8")
+  .Doc(R"doc(
+Decodes the contents of a PNG file into a uint8 tensor.
+
+contents: PNG file contents.
+channels: Number of color channels, or 0 to autodetect based on the input.
+  Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.
+  If the input has a different number of channels, it will be transformed
+  accordingly.
+image:= A 3-D uint8 tensor of shape `[height, width, channels]`.
+  If `channels` is 0, the last dimension is determined
+  from the png contents.
+)doc");
+```
 
 Results in this piece of Markdown:
 
@@ -429,12 +429,12 @@ Results in this piece of Markdown:
 
     #### Args:
 
-    *  <b>contents</b>: A string Tensor. PNG file contents.
-    *  <b>channels</b>: An optional int. Defaults to 0.
+    *  **contents**: A string Tensor. PNG file contents.
+    *  **channels**: An optional int. Defaults to 0.
        Number of color channels, or 0 to autodetect based on the input.
        Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.  If the
        input has a different number of channels, it will be transformed accordingly.
-    *  <b>name</b>: A name for the operation (optional).
+    *  **name**: A name for the operation (optional).
 
     #### Returns:
     A 3-D uint8 tensor of shape `[height, width, channels]`.  If `channels` is
@@ -442,7 +442,7 @@ Results in this piece of Markdown:
 
 Much of the argument description is added automatically. In particular, the doc
 generator automatically adds the name and type of all inputs, attrs, and
-outputs. In the above example, `<b>contents</b>: A string Tensor.` was added
+outputs. In the above example, `contents: A string Tensor.` was added
 automatically. You should write your additional text to flow naturally after
 that description.
 
@@ -664,10 +664,10 @@ This generates the following Args section in
 
     #### Args:
 
-    * <b>`contents`</b>: A `Tensor` of type `string`. 0-D.  The PNG-encoded
+    * **`contents`**: A `Tensor` of type `string`. 0-D.  The PNG-encoded
       image.
-    * <b>`channels`</b>: An optional `int`. Defaults to `0`. Number of color
+    * **`channels`**: An optional `int`. Defaults to `0`. Number of color
       channels for the decoded image.
-    * <b>`dtype`</b>: An optional `tf.DType` from: `tf.uint8,
+    * **`dtype`**: An optional `tf.DType` from: `tf.uint8,
       tf.uint16`. Defaults to `tf.uint 8`.
-    * <b>`name`</b>: A name for the operation (optional).
+    * **`name`**: A name for the operation (optional).
-- 
GitLab


From 5dd4bf753b8f708db69a7ab455a25fb0bb9821a5 Mon Sep 17 00:00:00 2001
From: Martin Wicke <577277+martinwicke@users.noreply.github.com>
Date: Tue, 17 Apr 2018 11:54:48 -0700
Subject: [PATCH 069/450] Merge pull request #18601 from
 yongtang/18598-tf.compat.as_str

Fix tf.compat.as_str returns bytes issue in Python 3
---
 tensorflow/python/util/compat.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 4163fcac79..738479c946 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -45,7 +45,6 @@ from tensorflow.python.util.tf_export import tf_export
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('compat.as_bytes', 'compat.as_str')
 def as_bytes(bytes_or_text, encoding='utf-8'):
   """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text.
 
@@ -68,7 +67,6 @@ def as_bytes(bytes_or_text, encoding='utf-8'):
                     (bytes_or_text,))
 
 
-@tf_export('compat.as_text')
 def as_text(bytes_or_text, encoding='utf-8'):
   """Returns the given argument as a unicode string.
 
@@ -93,8 +91,12 @@ def as_text(bytes_or_text, encoding='utf-8'):
 # Convert an object to a `str` in both Python 2 and 3.
 if _six.PY2:
   as_str = as_bytes
+  tf_export('compat.as_bytes', 'compat.as_str')(as_bytes)
+  tf_export('compat.as_text')(as_text)
 else:
   as_str = as_text
+  tf_export('compat.as_bytes')(as_bytes)
+  tf_export('compat.as_text', 'compat.as_str')(as_text)
 
 
 @tf_export('compat.as_str_any')
-- 
GitLab


From fc1485183013b5e71cdc1b566e01083cbde8305f Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 18 Apr 2018 09:03:21 -0700
Subject: [PATCH 070/450] Avoid generating degenerate dimensions during gather
 expansions

This gets rid of two cases that used to introduce degenerate dimensions
(dimensions with bound = 1) into the while loop state:

 - Previously we'd explicitly reshape gathers using scalar indices to have a
   minor degenerate dimension.  With this CL we no longer do that - instead we
   push this into the code that looks up the index vector from the gather
   indices tensor.
 - Previously we'd have the accumulator (the tensor we're
   dynamic-update-slice-ing into) contain all of the degenerate window dims that
   the gather op would later elide (after the while loop).  With this CL we
   eagerly elide these dimensions as we slice out individual windows from the
   operand.

PiperOrigin-RevId: 193365863
---
 .../compiler/xla/service/gather_expander.cc   | 143 +++++++++---------
 .../xla/service/gather_expander_test.cc       |  57 +++++++
 2 files changed, 130 insertions(+), 70 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index 1239f56364..2d3e4b1fcd 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -28,9 +28,15 @@ using tensorflow::gtl::ArraySlice;
 static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
     HloInstruction* gather_indices, int64 index_vector_dim) {
   const Shape& gather_indices_shape = gather_indices->shape();
+
+  if (gather_indices_shape.dimensions_size() == index_vector_dim) {
+    return gather_indices;
+  }
+
   if (index_vector_dim == (gather_indices_shape.dimensions_size() - 1)) {
     return gather_indices;
   }
+
   std::vector<int64> permutation;
   permutation.reserve(gather_indices_shape.dimensions_size());
   for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
@@ -42,54 +48,35 @@ static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
   return MakeTransposeHlo(gather_indices, permutation);
 }
 
-// If the gather_indices holds scalar indices (i.e. gather_indices has rank N
-// and index_vector_dim is N) then reshape it to have a trailing degenerate
-// dimension.  This makes the code for slicing out the index vector more
-// uniform.
-static StatusOr<HloInstruction*> DeScalarizeGatherIndices(
-    HloInstruction* gather_indices, int64 index_vector_dim) {
-  const Shape& gather_indices_shape = gather_indices->shape();
-  if (index_vector_dim != gather_indices_shape.dimensions_size()) {
-    return gather_indices;
-  }
-
-  DCHECK_EQ(index_vector_dim, gather_indices_shape.dimensions_size());
-
-  std::vector<int64> result_shape_dims;
-  c_copy(gather_indices_shape.dimensions(),
-         std::back_inserter(result_shape_dims));
-  result_shape_dims.push_back(1);
-
-  return MakeReshapeHlo(result_shape_dims, gather_indices);
-}
-
 // Canonicalizes the gather_indices tensors so that we only have deal with some
 // specific cases in the while loop that does the heavy lifting.
 //
 // See the "High Level Algorithm" section for a broader picture.
 static StatusOr<HloInstruction*> CanonicalizeGatherIndices(
     HloInstruction* gather_indices, int64 index_vector_dim) {
-  // If gather_indices holds scalar indices, normalize it to hold index vectors
-  // of size 1.
+  // Transpose the non-index-vector dimensions to the front.
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * descalarized_gather_indices,
-      DeScalarizeGatherIndices(gather_indices, index_vector_dim));
+      HloInstruction * transposed_gather_indices,
+      TransposeIndexVectorDimToLast(gather_indices, index_vector_dim));
+  bool indices_are_scalar =
+      index_vector_dim == gather_indices->shape().dimensions_size();
 
-  // Transpose the non-index-vector dimensions to the front.
-  TF_ASSIGN_OR_RETURN(HloInstruction * transposed_gather_indices,
-                      TransposeIndexVectorDimToLast(descalarized_gather_indices,
-                                                    index_vector_dim));
+  // The number of dimensions in gather_indices that are index dimensions.
+  const int64 index_dims_in_gather_indices = indices_are_scalar ? 0 : 1;
 
   // If there is only one index (i.e. gather_indices has rank 1 and this gather
   // is really just a dynamic slice) add a leading degenerate dimension for
   // uniformity.  Otherwise create a "collapsed" leading dimension that subsumes
   // all of the non-index-vector dimensions.
   const Shape& shape = transposed_gather_indices->shape();
-  if (shape.dimensions_size() == 1) {
+  if (shape.dimensions_size() == index_dims_in_gather_indices) {
     return PrependDegenerateDims(transposed_gather_indices, 1);
   } else {
-    return CollapseFirstNDims(transposed_gather_indices,
-                              shape.dimensions_size() - 1);
+    // Collapse all but the dimensions (0 or 1) in gather_indices containing the
+    // index vectors.
+    return CollapseFirstNDims(
+        transposed_gather_indices,
+        shape.dimensions_size() - index_dims_in_gather_indices);
   }
 }
 
@@ -156,48 +143,73 @@ static StatusOr<HloInstruction*> ExpandIndexVectorIntoOperandSpace(
 static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
     const HloInstruction& gather, HloInstruction* induction_var,
     const std::vector<HloInstruction*>& incoming_loop_state) {
+  const GatherDimensionNumbers& dim_numbers = gather.gather_dimension_numbers();
   CHECK_EQ(incoming_loop_state.size(), 3);
   HloInstruction* const operand = incoming_loop_state[0];
   HloInstruction* const gather_indices = incoming_loop_state[1];
   HloInstruction* const output_accumulator = incoming_loop_state[2];
 
-  int64 index_vector_size = gather_indices->shape().dimensions(1);
+  bool has_scalar_indices = gather_indices->shape().dimensions_size() == 1;
+  CHECK_EQ(has_scalar_indices,
+           dim_numbers.index_vector_dim() ==
+               gather.operand(1)->shape().dimensions_size());
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * induction_var_as_vector,
       MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
                        /*result_shape_bounds=*/{1}));
 
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * index_into_gather_indices,
-      PadVectorWithZeros(induction_var_as_vector,
-                         /*zeros_to_prepend=*/0, /*zeros_to_append=*/1));
-
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * index_vector_2d,
-      MakeDynamicSliceHlo(gather_indices, index_into_gather_indices,
-                          {1, index_vector_size}));
+  HloInstruction* index_vector;
 
-  TF_ASSIGN_OR_RETURN(HloInstruction * index_vector,
-                      ElideDegenerateDims(index_vector_2d, {0}));
+  if (has_scalar_indices) {
+    // In this case gather_indices has rank 1 and induction_var_as_vector (of
+    // shape {1}) is an index into this rank 1 tensor.
+    TF_ASSIGN_OR_RETURN(
+        index_vector,
+        MakeDynamicSliceHlo(gather_indices, induction_var_as_vector, {1}));
+  } else {
+    // In this case gather_indices has rank 2 and induction_var_as_vector (of
+    // shape {1}) is an index into just the first dimension of this rank 2
+    // tensor.
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * index_into_gather_indices,
+        PadVectorWithZeros(induction_var_as_vector,
+                           /*zeros_to_prepend=*/0, /*zeros_to_append=*/1));
+
+    int64 index_vector_size = gather_indices->shape().dimensions(1);
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * index_vector_2d,
+        MakeDynamicSliceHlo(gather_indices, index_into_gather_indices,
+                            {1, index_vector_size}));
+
+    TF_ASSIGN_OR_RETURN(index_vector,
+                        ElideDegenerateDims(index_vector_2d, {0}));
+  }
 
-  TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice_start,
-                      ExpandIndexVectorIntoOperandSpace(
-                          index_vector, gather.gather_dimension_numbers(),
-                          operand->shape().dimensions_size()));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * gathered_slice_start,
+      ExpandIndexVectorIntoOperandSpace(index_vector, dim_numbers,
+                                        operand->shape().dimensions_size()));
 
   TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice,
                       MakeDynamicSliceHlo(operand, gathered_slice_start,
                                           gather.gather_window_bounds()));
 
-  TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice_for_update,
-                      PrependDegenerateDims(gathered_slice, 1));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * gathered_slice_with_dims_elided,
+      ElideDegenerateDims(gathered_slice,
+                          AsInt64Slice(dim_numbers.elided_window_dims())));
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * gathered_slice_for_update,
+      PrependDegenerateDims(gathered_slice_with_dims_elided, 1));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * index_vector_into_accumulator,
       PadVectorWithZeros(
           induction_var_as_vector, /*zeros_to_prepend=*/0,
-          /*zeros_to_append=*/gathered_slice->shape().dimensions_size()));
+          /*zeros_to_append=*/
+          gathered_slice_with_dims_elided->shape().dimensions_size()));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * updated_accumulator,
@@ -213,26 +225,20 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
 
 static StatusOr<HloInstruction*> CreateGatherLoopAccumulatorInitValue(
     HloComputation* computation, PrimitiveType element_type,
-    ArraySlice<int64> window_bounds, int64 gather_loop_trip_count) {
+    ArraySlice<int64> window_bounds, int64 gather_loop_trip_count,
+    const GatherDimensionNumbers& dim_numbers) {
   std::vector<int64> accumulator_state_shape_dims;
   accumulator_state_shape_dims.reserve(1 + window_bounds.size());
   accumulator_state_shape_dims.push_back(gather_loop_trip_count);
-  c_copy(window_bounds, std::back_inserter(accumulator_state_shape_dims));
+  for (int64 i = 0; i < window_bounds.size(); i++) {
+    if (!c_binary_search(dim_numbers.elided_window_dims(), i)) {
+      accumulator_state_shape_dims.push_back(window_bounds[i]);
+    }
+  }
   return BroadcastZeros(computation, element_type,
                         accumulator_state_shape_dims);
 }
 
-static StatusOr<HloInstruction*> ElideWindowDimsFromAccumulator(
-    HloInstruction* accumulator, const GatherDimensionNumbers& dim_numbers) {
-  std::vector<int64> dims_to_elide;
-  dims_to_elide.reserve(dim_numbers.elided_window_dims_size());
-  for (int64 elided_window_dim : dim_numbers.elided_window_dims()) {
-    dims_to_elide.push_back(elided_window_dim + 1);
-  }
-
-  return ElideDegenerateDims(accumulator, dims_to_elide);
-}
-
 // `accumulator` is almost the tensor the gather operation would have produced,
 // except that it has the dimensions in the wrong order -- the gather dimensions
 // are the major dimensions and the window dimensions are the minor dimensions.
@@ -331,7 +337,8 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
       HloInstruction * accumulator_init,
       CreateGatherLoopAccumulatorInitValue(
           computation, output_shape.element_type(),
-          gather_instr->gather_window_bounds(), gather_loop_trip_count));
+          gather_instr->gather_window_bounds(), gather_loop_trip_count,
+          gather_instr->gather_dimension_numbers()));
 
   StatusOr<std::vector<HloInstruction*>> gather_loop_result_or_error =
       WhileUtil::MakeCountedLoop(
@@ -346,14 +353,10 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
                       gather_loop_result_or_error);
 
   HloInstruction* accumulator_result = gather_loop_result.back();
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * accumulator_with_window_dims_elided,
-      ElideWindowDimsFromAccumulator(accumulator_result, dim_numbers));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * accumulator_with_output_gather_dims_decanonicalized,
-      AdjustGatherDimsInAccumulator(gather_indices->shape(),
-                                    accumulator_with_window_dims_elided,
+      AdjustGatherDimsInAccumulator(gather_indices->shape(), accumulator_result,
                                     dim_numbers.index_vector_dim()));
 
   return PermuteGatherAndWindowDims(
diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc
index ba41ee8428..1c72ca0665 100644
--- a/tensorflow/compiler/xla/service/gather_expander_test.cc
+++ b/tensorflow/compiler/xla/service/gather_expander_test.cc
@@ -47,5 +47,62 @@ ENTRY main {
                            "indices are not supported."));
 }
 
+TEST(GatherExpanderTest, AvoidDegenerateDims) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherV2
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[3,2] gather(operand, indices),
+      output_window_dims={0},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=1,
+      window_bounds={3, 1}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  HloInstruction* while_instr = nullptr;
+  for (auto* instr : module->entry_computation()->instructions()) {
+    if (instr->opcode() == HloOpcode::kWhile) {
+      ASSERT_EQ(while_instr, nullptr)
+          << "Expected exactly one while instruction in the entry computation "
+             "after gather expansion";
+      while_instr = instr;
+    }
+  }
+
+  ASSERT_NE(while_instr, nullptr)
+      << "Expected exactly one while instruction in the entry computation "
+         "after gather expansion";
+
+  // We want to avoid creating while loops with shapes that have degenerate
+  // dimensions for TF gather.  In this case we expect the loop state to be of
+  // the shape (sNN[], s32[3,3]{1,0}, s32[2]{0}, s32[2,3]{1,0}).  The leading
+  // sNN is an implementation detail from WhileUtil::MakeCountedLoop so we don't
+  // check it here (though in theory the form of the while loop state is itself
+  // an implementation detail from WhileUtil::MakeCountedLoop).
+
+  const Shape& while_shape = while_instr->shape();
+  ASSERT_TRUE(ShapeUtil::IsTuple(while_shape));
+  ASSERT_EQ(ShapeUtil::TupleElementCount(while_shape), 4);
+
+  EXPECT_TRUE(ShapeUtil::SameDimensions(
+      ShapeUtil::MakeShape(S32, {3, 3}),
+      ShapeUtil::GetTupleElementShape(while_shape, 1)));
+
+  EXPECT_TRUE(ShapeUtil::SameDimensions(
+      ShapeUtil::MakeShape(S32, {2}),
+      ShapeUtil::GetTupleElementShape(while_shape, 2)));
+
+  EXPECT_TRUE(ShapeUtil::SameDimensions(
+      ShapeUtil::MakeShape(S32, {2, 3}),
+      ShapeUtil::GetTupleElementShape(while_shape, 3)));
+}
 }  // namespace
 }  // namespace xla
-- 
GitLab


From b234c288c1e3ec8f98ba99df738aa64b81659925 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 09:30:17 -0700
Subject: [PATCH 071/450] Updating some more tests in constant_folding_test.cc
 so that the tests evaluate the original and optimized graphs and check if
 their outputs are the same.

PiperOrigin-RevId: 193369280
---
 .../optimizers/constant_folding_test.cc       | 88 ++++++++++++++++++-
 1 file changed, 87 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 36625b68b7..4b41dae480 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -520,6 +520,25 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
       EXPECT_EQ("Mul", node.op()) << node.name();
     }
   }
+
+  const std::vector<string> fetch = {"mul_0", "mul_4", "mul_8"};
+  auto x_known_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto x_partially_unknown_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
+  auto expected_tensors =
+      EvaluateNodes(item.graph, fetch,
+                    {{"x_known", x_known_t},
+                     {"x_partially_unknown", x_partially_unknown_t},
+                     {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(fetch.size(), expected_tensors.size());
+  auto tensors = EvaluateNodes(output, fetch,
+                               {{"x_known", x_known_t},
+                                {"x_partially_unknown", x_partially_unknown_t},
+                                {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(fetch.size(), tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorNear<float>(expected_tensors[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
@@ -572,6 +591,20 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
       EXPECT_TRUE(IsControlInput(node.input(1)));
     }
   }
+  const std::vector<string> fetch = {"addn1"};
+  auto x_partially_unknown_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto expected_tensors =
+      EvaluateNodes(item.graph, fetch,
+                    {{"x_partially_unknown", x_partially_unknown_t},
+                     {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(1, expected_tensors.size());
+  auto tensors = EvaluateNodes(output, fetch,
+                               {{"x_partially_unknown", x_partially_unknown_t},
+                                {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(expected_tensors[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, CreateConstNodes) {
@@ -1056,6 +1089,20 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
     }
   }
   EXPECT_EQ(9, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 6}));
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+  const std::vector<string> fetch_nodes = {"i1a", "i1b", "i2a", "i2b",
+                                           "i2c", "i3a", "i3b"};
+  auto tensors_expected = EvaluateNodes(
+      item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes,
+                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
@@ -1888,6 +1935,14 @@ TEST_F(ConstantFoldingTest, Packing) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  const std::vector<string> fetch_nodes = {"i1", "i2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes);
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes);
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+
   // Make sure that the representation of the folded constant is space
   // efficient: in particular, the whole message should be smaller than 8k
   // (the size needed to naively encode 1000 floats folded twice).
@@ -1923,6 +1978,13 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  std::vector<string> fetch_nodes = {"o1", "o2", "p1", "p2"};
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 5}));
+  auto g_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}, {"g", g_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
   status = optimizer.Optimize(nullptr, item, &output);
@@ -1963,6 +2025,11 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
     }
   }
   EXPECT_EQ(6, found);
+
+  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}, {"g", g_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
@@ -1982,6 +2049,11 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch_nodes = {"o1", "o2"};
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -2036,6 +2108,10 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
     }
   }
   EXPECT_EQ(7, found);
+  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
@@ -2513,7 +2589,7 @@ TEST_F(ConstantFoldingTest, Enter) {
   value_tensor.AsProtoTensorContent(value.mutable_tensor());
 
   GraphDef& graph = item.graph;
-  AddNode("x", "Placeholder", {}, {{"T", type}}, &graph);
+  AddNode("x", "Placeholder", {}, {{"dtype", type}}, &graph);
   AddNode("c1", "Const", {"^x"}, {{"value", value}, {"dtype", type}}, &graph);
   AddNode("enter1", "Enter", {"x"},
           {{"T", type},
@@ -2539,6 +2615,10 @@ TEST_F(ConstantFoldingTest, Enter) {
   item.fetch.push_back("id3");
   item.fetch.push_back("id4");
 
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(item.fetch.size(), tensors_expected.size());
+
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -2566,6 +2646,12 @@ TEST_F(ConstantFoldingTest, Enter) {
       EXPECT_EQ("enter3", node.input(0));
     }
   }
+
+  auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(item.fetch.size(), tensors.size());
+
+  for (int i = 0; i < item.fetch.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, TensorArraySize) {
-- 
GitLab


From 857ee499e35d94a61ca4c90a6f6a20bc9dee80c4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 09:40:21 -0700
Subject: [PATCH 072/450] Simplify the break canonicalization transformer to
 use more of the base transformer helpers. Add support for the loop's else
 block.

PiperOrigin-RevId: 193370640
---
 .../autograph/converters/break_statements.py  | 92 +++++++++++------
 .../converters/break_statements_test.py       | 99 +++++++++++++------
 .../contrib/autograph/pyct/transformer.py     |  4 +
 3 files changed, 132 insertions(+), 63 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py
index 5dfb7a59d5..91de82f0a7 100644
--- a/tensorflow/contrib/autograph/converters/break_statements.py
+++ b/tensorflow/contrib/autograph/converters/break_statements.py
@@ -24,72 +24,102 @@ from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
-class BreakCanonicalizationTransformer(transformer.Base):
+# Tags for local state.
+BREAK_USED = 'break_used'
+CONTROL_VAR_NAME = 'control_var_name'
+
+
+class BreakStatementTransformer(transformer.Base):
   """Canonicalizes break statements into additional conditionals."""
 
-  def __init__(self, context):
-    super(BreakCanonicalizationTransformer, self).__init__(context)
-    # This is a stack structure, to correctly process nested loops.
-    # Each item is a list [break_used, break_variable_name]
-    self.break_uses = []
+  def _track_body(self, nodes, break_var):
+    self.enter_local_scope()
+    self.set_local(CONTROL_VAR_NAME, break_var)
+    nodes = self.visit_block(nodes)
+    break_used = self.get_local(BREAK_USED, False)
+    self.exit_local_scope()
+    return nodes, break_used
 
   def visit_Break(self, node):
-    self.break_uses[-1][0] = True
+    self.set_local(BREAK_USED, True)
+    var_name = self.get_local(CONTROL_VAR_NAME)
+    # TODO(mdan): This will fail when expanded inside a top-level else block.
     template = """
       var_name = True
       continue
     """
-    return templates.replace(template, var_name=self.break_uses[-1][1])
+    return templates.replace(template, var_name=var_name)
+
+  def _guard_if_present(self, block, var_name):
+    """Prevents the block from executing if var_name is set."""
+    if not block:
+      return block
+    template = """
+        if not var_name:
+          block
+      """
+    node = templates.replace(
+        template,
+        var_name=var_name,
+        block=block)
+    return node
 
   def visit_While(self, node):
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-    break_var = self.context.namer.new_symbol('break_requested',
-                                              scope.referenced)
+    break_var = self.context.namer.new_symbol('break__', scope.referenced)
 
-    self.break_uses.append([False, break_var])
-    node = self.generic_visit(node)
-    if self.break_uses[-1][0]:
+    node.test = self.visit(node.test)
+    node.body, break_used = self._track_body(node.body, break_var)
+    # A break in the else clause applies to the containing scope.
+    node.orelse = self.visit_block(node.orelse)
+
+    if break_used:
       template = """
         var_name = False
-        while original_test and not var_name:
-          original_body
+        while test and not var_name:
+          body
         else:
-          original_orelse
+          orelse
       """
+      # Python's else clause only triggers if the loop exited cleanly (e.g.
+      # break did not trigger).
       node = templates.replace(
           template,
           var_name=break_var,
-          original_test=node.test,
-          original_body=node.body,
-          original_orelse=node.orelse)
-    self.break_uses.pop()
+          test=node.test,
+          body=node.body,
+          orelse=self._guard_if_present(node.orelse, break_var))
 
     return node
 
   def visit_For(self, node):
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-    break_var = self.context.namer.new_symbol('break_requested',
-                                              scope.referenced)
+    break_var = self.context.namer.new_symbol('break__', scope.referenced)
+
+    node.target = self.visit(node.target)
+    node.iter = self.visit(node.iter)
+    node.body, break_used = self._track_body(node.body, break_var)
+    # A break in the else clause applies to the containing scope.
+    node.orelse = self.visit_block(node.orelse)
 
-    self.break_uses.append([False, break_var])
-    node = self.generic_visit(node)
-    if self.break_uses[-1][0]:
+    if break_used:
+      node.orelse = self._guard_if_present(node.orelse, break_var)
       template = """
         var_name = False
-        original_for
+        for_stmt
       """
+      # Python's else clause only triggers if the loop exited cleanly (e.g.
+      # break did not trigger).
       node = templates.replace(
           template,
           var_name=break_var,
-          original_for=node)
+          for_stmt=node)
       extra_cond = templates.replace_as_expression(
           'not var_name', var_name=break_var)
-      new_for_node = node[1]
-      anno.setanno(new_for_node, 'extra_cond', extra_cond)
-    self.break_uses.pop()
+      anno.setanno(node[1], 'extra_cond', extra_cond)
 
     return node
 
 
 def transform(node, context):
-  return BreakCanonicalizationTransformer(context).visit(node)
+  return BreakStatementTransformer(context).visit(node)
diff --git a/tensorflow/contrib/autograph/converters/break_statements_test.py b/tensorflow/contrib/autograph/converters/break_statements_test.py
index dd4914a022..1af59e9b52 100644
--- a/tensorflow/contrib/autograph/converters/break_statements_test.py
+++ b/tensorflow/contrib/autograph/converters/break_statements_test.py
@@ -25,7 +25,7 @@ from tensorflow.python.platform import test
 
 class BreakCanonicalizationTest(converter_test_base.TestCase):
 
-  def test_basic_break(self):
+  def test_basic_while(self):
 
     def test_fn(x):
       v = []
@@ -40,13 +40,11 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
     node = break_statements.transform(node, self.ctx)
 
     with self.compiled(node) as result:
-      self.assertEqual(test_fn(0), result.test_fn(0))
-      self.assertEqual(test_fn(1), result.test_fn(1))
-      self.assertEqual(test_fn(2), result.test_fn(2))
-      self.assertEqual(test_fn(3), result.test_fn(3))
-      self.assertEqual(test_fn(4), result.test_fn(4))
+      self.assertEqual([], result.test_fn(0))
+      self.assertEqual([], result.test_fn(1))
+      self.assertEqual([3], result.test_fn(4))
 
-  def test_basic_break_for_loop(self):
+  def test_basic_for(self):
 
     def test_fn(a):
       v = []
@@ -57,30 +55,18 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v
 
-    # The break is incompletely canonicalized for for loops. Everything is
-    # in place except for the condition verification.
-    def test_equiv_fn(a):
-      v = []
-      for x in a:
-        x -= 1
-        if x % 2 == 0:
-          continue
-        v.append(x)
-      return v
-
     node = self.parse_and_analyze(test_fn, {})
     node = break_statements.transform(node, self.ctx)
 
     with self.compiled(node) as result:
-      # The break is incompletely canonicalized. Everything is in place, but
-      # the loop does not break.
-      self.assertEqual(test_equiv_fn([]), result.test_fn([]))
-      self.assertEqual(test_equiv_fn([1]), result.test_fn([1]))
-      self.assertEqual(test_equiv_fn([2]), result.test_fn([2]))
-      self.assertEqual(
-          test_equiv_fn([1, 2, 3, 4]), result.test_fn([1, 2, 3, 4]))
+      # The break is incompletely canonicalized. The loop will not interrupt,
+      # but the section following the break will be skipped.
+      self.assertEqual([], result.test_fn([]))
+      self.assertEqual([3, 3], result.test_fn([4, 4]))
+      self.assertEqual([3], result.test_fn([4, 5]))
+      self.assertEqual([3], result.test_fn([5, 4]))
 
-  def test_continue_deeply_nested(self):
+  def test_deeply_nested(self):
 
     def test_fn(x):
       v = []
@@ -93,7 +79,7 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
             u.append(x)
           else:
             w.append(x)
-            continue
+            break
         v.append(x)
       return v, u, w
 
@@ -101,11 +87,60 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
     node = break_statements.transform(node, self.ctx)
 
     with self.compiled(node) as result:
-      self.assertEqual(test_fn(0), result.test_fn(0))
-      self.assertEqual(test_fn(1), result.test_fn(1))
-      self.assertEqual(test_fn(2), result.test_fn(2))
-      self.assertEqual(test_fn(3), result.test_fn(3))
-      self.assertEqual(test_fn(4), result.test_fn(4))
+      self.assertEqual(([], [], []), result.test_fn(0))
+      self.assertEqual(([2, 1], [2], [0]), result.test_fn(3))
+      self.assertEqual(([10, 9, 8, 7], [10, 8], [6]), result.test_fn(11))
+
+  def test_nested_loops(self):
+
+    def test_fn(x):
+      v = []
+      u = []
+      while x > 0:
+        x -= 1
+        y = x
+        while y > 0:
+          y -= 1
+          if y % 2 == 0:
+            break
+          u.append(y)
+        if x == 0:
+          break
+        v.append(x)
+      return v, u
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(([], []), result.test_fn(0))
+      self.assertEqual(([1], []), result.test_fn(2))
+      self.assertEqual(([2, 1], [1]), result.test_fn(3))
+      self.assertEqual(([4, 3, 2, 1], [3, 1]), result.test_fn(5))
+
+  def test_loop_else(self):
+
+    def test_fn(x):
+      v = []
+      u = []
+      while x > 0:
+        x -= 1
+        y = x
+        while y > 1:
+          break
+        else:
+          u.append(y)
+          break
+        v.append(x)
+      return v, u
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(([], []), result.test_fn(0))
+      self.assertEqual(([], [1]), result.test_fn(2))
+      self.assertEqual(([2], [1]), result.test_fn(3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
index e102ab7630..4db6cc0adf 100644
--- a/tensorflow/contrib/autograph/pyct/transformer.py
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -69,6 +69,10 @@ class Base(gast.NodeTransformer):
   def enclosing_entities(self):
     return tuple(self._enclosing_entities)
 
+  @property
+  def locel_scope_level(self):
+    return len(self._local_scope_state)
+
   def enter_local_scope(self):
     self._local_scope_state.append({})
 
-- 
GitLab


From 910b77c46ce58a36964e30a1590d8037013d0782 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 10:27:48 -0700
Subject: [PATCH 073/450] Update ops-related pbtxt files.

PiperOrigin-RevId: 193378087
---
 tensorflow/core/ops/ops.pbtxt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 1659adc9fe..a36608ded3 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4021,6 +4021,10 @@ op {
     name: "tree_complexity"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_node_weight"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "node_ids_list"
     type: DT_INT32
-- 
GitLab


From ce7a92a62a6bbf0765e68a3340fe3efb07ac1e2b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 10:28:47 -0700
Subject: [PATCH 074/450] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 193378249

---
 tensorflow/go/op/wrappers.go | 3049 ++++++++++++++++++++++++++++++++--
 1 file changed, 2918 insertions(+), 131 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 1d5ebf6687..1d4b1399ed 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -43,7 +43,7 @@ type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr)
 
 // FakeQuantWithMinMaxVarsPerChannelGradientNumBits sets the optional num_bits attribute to value.
 //
-// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// value: The bitwidth of the quantization; between 2 and 16, inclusive.
 // If not specified, defaults to 8
 func FakeQuantWithMinMaxVarsPerChannelGradientNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
 	return func(m optionalAttr) {
@@ -124,7 +124,7 @@ func FakeQuantWithMinMaxVarsPerChannelNarrowRange(value bool) FakeQuantWithMinMa
 // `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 // when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 // then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
 // This operation has a gradient and thus allows for training `min` and `max`
 // values.
@@ -305,7 +305,7 @@ func FakeQuantWithMinMaxArgsNarrowRange(value bool) FakeQuantWithMinMaxArgsAttr
 // `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 // when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 // then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
 // Quantization is called fake since the output is still in floating point.
 func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) {
@@ -401,6 +401,9 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 //      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
 //      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
 //
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
 // Arguments:
 //	indices: Index tensor.
 //	updates: Updates to scatter into output.
@@ -1845,6 +1848,93 @@ func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_d
 	return op.Output(0)
 }
 
+// UniqueWithCountsV2Attr is an optional argument to UniqueWithCountsV2.
+type UniqueWithCountsV2Attr func(optionalAttr)
+
+// UniqueWithCountsV2OutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsV2OutIdx(value tf.DataType) UniqueWithCountsV2Attr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements along an axis of a tensor.
+//
+// This operation either returns a tensor `y` containing unique elements
+// along the `axis` of a tensor. The returned unique elements is sorted
+// in the same order as they occur along `axis` in `x`.
+// This operation also returns a tensor `idx` and a tensor `count`
+// that are the same size as the number of the elements in `x` along the
+// `axis` dimension. The `idx` contains the index in the unique output `y`
+// and the `count` contains the count in the unique output `y`.
+// In other words, for an `1-D` tensor `x` with `axis = None:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
+//
+// For an `2-D` tensor `x` with `axis = 0`:
+//
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx, count = unique_with_counts(x, axis=0)
+// y ==> [[1, 0, 0],
+//        [2, 0, 0]]
+// idx ==> [0, 0, 1]
+// count ==> [2, 1]
+// ```
+//
+// For an `2-D` tensor `x` with `axis = 1`:
+//
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx, count = unique_with_counts(x, axis=1)
+// y ==> [[1, 0],
+//        [1, 0],
+//        [2, 0]]
+// idx ==> [0, 1, 1]
+// count ==> [1, 2]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`.
+//	axis: A `Tensor` of type `int32` (default: None). The axis of the Tensor to
+// find the unique elements.
+//
+// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
+// value of x in the output y.A 1-D Tensor. The count of each value of x in the output y.
+func UniqueWithCountsV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueWithCountsV2Attr) (y tf.Output, idx tf.Output, count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniqueWithCountsV2",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
 type UniqueWithCountsAttr func(optionalAttr)
 
@@ -1910,12 +2000,15 @@ func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
+// Finds unique elements along an axis of a tensor.
 //
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
+// This operation either returns a tensor `y` containing unique elements
+// along the `axis` of a tensor. The returned unique elements is sorted
+// in the same order as they occur along `axis` in `x`.
+// This operation also returns a tensor `idx` that is the same size as
+// the number of the elements in `x` along the `axis` dimension. It
+// contains the index in the unique output `y`.
+// In other words, for an `1-D` tensor `x` with `axis = None:
 //
 // `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
@@ -1928,9 +2021,34 @@ func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
 // idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
 // ```
 //
+// For an `2-D` tensor `x` with `axis = 0`:
+//
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx = unique(x, axis=0)
+// y ==> [[1, 0, 0],
+//        [2, 0, 0]]
+// idx ==> [0, 0, 1]
+// ```
+//
+// For an `2-D` tensor `x` with `axis = 1`:
+//
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx = unique(x, axis=1)
+// y ==> [[1, 0],
+//        [1, 0],
+//        [2, 0]]
+// idx ==> [0, 1, 1]
+// ```
+//
 // Arguments:
 //	x: A `Tensor`.
-//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+//	axis: A `Tensor` of type `int32` (default: None). The axis of the Tensor to
 // find the unique elements.
 //
 // Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
@@ -2217,6 +2335,35 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output,
 	return scope.AddOperation(opspec)
 }
 
+// Clips tensor values to a specified min and max.
+//
+// Given a tensor `t`, this operation returns a tensor of the same type and
+// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+// greater than `clip_value_max` are set to `clip_value_max`.
+//
+// Arguments:
+//	t: A `Tensor`.
+//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The minimum value to clip by.
+//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The maximum value to clip by.
+//
+// Returns A clipped `Tensor` with the same shape as input 't'.
+func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ClipByValue",
+		Input: []tf.Input{
+			t, clip_value_min, clip_value_max,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a sequence of numbers.
 //
 // This operation creates a sequence of numbers that begins at `start` and
@@ -2277,7 +2424,7 @@ func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, seg
 
 // Computes the mean along sparse segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
@@ -2332,7 +2479,7 @@ func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.
 // Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
 // misisng, the `output` tensor at that position will be zeroed.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // For example:
@@ -2507,7 +2654,7 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 
 // Computes the sum along sparse segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
@@ -2572,6 +2719,44 @@ func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Computes the minimum along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the minimum such that:
+//
+// \\(output_i = \min_j data_j\\) where min is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the minimum is empty for a given segment ID `i`, it outputs the largest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::max()`.
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
+//
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentMin",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes rectified linear 6: `min(max(features, 0), 6)`.
 func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
@@ -2589,7 +2774,7 @@ func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 
 // Computes the sum along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -2920,6 +3105,32 @@ func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output)
 	return op.Output(0)
 }
 
+// Creates a dataset that passes a sliding window over `input_dataset`.
+//
+// Arguments:
+//
+//	window_size: A scalar representing the number of elements in the
+// sliding window.
+//	stride: A scalar representing the steps moving the sliding window
+// forward in one iteration. It must be in `[1, window_size)`.
+//
+//
+func SlideDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "SlideDataset",
+		Input: []tf.Input{
+			input_dataset, window_size, stride,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
 // N is the size of the segment being reduced.
@@ -2927,7 +3138,7 @@ func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output)
 // Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
 // misisng, the `output` tensor at that position will be zeroed.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Arguments:
@@ -3233,20 +3444,21 @@ func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthEleme
 	return op.Output(0)
 }
 
-// Computes the Max along segments of a tensor.
+// Computes the maximum along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
-// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum
-// such that:
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum such that:
 //
 // \\(output_i = \max_j data_j\\) where max is over `j` such
 // that `segment_ids[j] == i`.
 //
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
-//  `output[i] = numeric_limits<T>::min()`.
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::lowest()`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 // <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
@@ -3656,7 +3868,7 @@ func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output)
 // Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
 // misisng, the `output` tensor at that position will be zeroed.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Arguments:
@@ -3758,9 +3970,8 @@ type ResizeBicubicAttr func(optionalAttr)
 
 // ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
 	return func(m optionalAttr) {
@@ -4171,6 +4382,26 @@ func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Checks whether a tree ensemble has been initialized.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble resouce.
+//
+// Returns output boolean on whether it is initialized or not.
+func IsBoostedTreesEnsembleInitialized(scope *Scope, tree_ensemble_handle tf.Output) (is_initialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsBoostedTreesEnsembleInitialized",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Cast x of type SrcT to y of DstT.
 func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
 	if scope.Err() != nil {
@@ -4845,6 +5076,23 @@ func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backpr
 	return op.Output(0)
 }
 
+// Provides the time since epoch in seconds.
+//
+// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+//
+// Note: the timestamp is computed when the op is executed, not when it is added
+// to the graph.
+func Timestamp(scope *Scope) (ts tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Timestamp",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // BatchMatMulAttr is an optional argument to BatchMatMul.
 type BatchMatMulAttr func(optionalAttr)
 
@@ -5315,6 +5563,51 @@ func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax
 	return op.Output(0)
 }
 
+// MutexV2Attr is an optional argument to MutexV2.
+type MutexV2Attr func(optionalAttr)
+
+// MutexV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this variable is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutexV2Container(value string) MutexV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutexV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this variable is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func MutexV2SharedName(value string) MutexV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a Mutex resource that can be locked by `MutexLock`.
+//
+// Returns The mutex resource.
+func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MutexV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPool3DAttr is an optional argument to AvgPool3D.
 type AvgPool3DAttr func(optionalAttr)
 
@@ -7069,6 +7362,44 @@ func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Broadcasts a tensor value to one or more other devices.
+func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "CollectiveBcastSend",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Makes a copy of `x`.
+//
+// Arguments:
+//	x: The source tensor of type `T`.
+//
+// Returns     y: A `Tensor` of type `T`. A copy of `x`. Guaranteed that `y`
+//       is not an alias of `x`.
+func DeepCopy(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DeepCopy",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
 // If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
@@ -7342,6 +7673,46 @@ func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Updates the tree ensemble by either adding a layer to the last tree being grown
+//
+// or by starting a new tree.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the ensemble variable.
+//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
+// the feature that will be used in the split.
+//	node_ids: List of rank 1 tensors representing the nodes for which this feature
+// has a split.
+//	gains: List of rank 1 tensors representing the gains for each of the feature's
+// split.
+//	thresholds: List of rank 1 tensors representing the thresholds for each of the
+// feature's split.
+//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
+// the feature's splits. Will be added to the previous node values to constitute
+// the values of the left nodes.
+//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
+// of the feature's splits. Will be added to the previous node values to constitute
+// the values of the right nodes.
+//	max_depth: Max depth of the tree to build.
+//	learning_rate: shrinkage const for each new tree.
+//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+//
+// Returns the created operation.
+func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesUpdateEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
 type ResourceSparseApplyFtrlAttr func(optionalAttr)
 
@@ -7419,7 +7790,7 @@ func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
 //
 // N is the size of the segment being reduced.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Arguments:
@@ -7652,24 +8023,65 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 	return op.Output(0)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
-
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+// Divides sparse updates into the variable referenced by `resource`.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs deterministic pseudorandom values from a normal distribution.
+// This operation computes
 //
-// The generated values will have mean 0 and standard deviation 1.
+//     # Scalar indices
+//     ref[indices, ...] /= updates[...]
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] /= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of values that `ref` is divided by.
+//
+// Returns the created operation.
+func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterDiv",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
+
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
 //	shape: The shape of the output tensor.
@@ -7695,6 +8107,47 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
+// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = min(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of values to reduce into `ref` using `min`.
+//
+// Returns the created operation.
+func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMin",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Reshapes a quantized tensor as per the Reshape op.
 //
 // ```
@@ -7904,9 +8357,8 @@ type ResizeBilinearAttr func(optionalAttr)
 
 // ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
 	return func(m optionalAttr) {
@@ -7959,6 +8411,26 @@ func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
 	return op.Output(0)
 }
 
+// Creates a TensorList which, when stacked, has the value of `tensor`.
+//
+// Each tensor in the result list corresponds to one row of the input tensor.
+//
+// tensor: The input tensor.
+// output_handle: The list.
+func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListFromTensor",
+		Input: []tf.Input{
+			tensor, element_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
 type GenerateVocabRemappingAttr func(optionalAttr)
 
@@ -8066,6 +8538,30 @@ func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output,
 	return scope.AddOperation(opspec)
 }
 
+// Creates and returns an empty tensor list.
+//
+// All list elements must be tensors of dtype element_dtype and shape compatible
+// with element_shape.
+//
+// handle: an empty tensor list.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func EmptyTensorList(scope *Scope, element_shape tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "EmptyTensorList",
+		Input: []tf.Input{
+			element_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPoolGradAttr is an optional argument to AvgPoolGrad.
 type AvgPoolGradAttr func(optionalAttr)
 
@@ -8547,6 +9043,49 @@ func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64,
 	return op.Output(0)
 }
 
+// RegexReplaceAttr is an optional argument to RegexReplace.
+type RegexReplaceAttr func(optionalAttr)
+
+// RegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+//
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
+	return func(m optionalAttr) {
+		m["replace_global"] = value
+	}
+}
+
+// Replaces the match of pattern in input with rewrite.
+//
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+//
+// Arguments:
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expression.
+//
+// Returns The text after applying pattern and rewrite.
+func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RegexReplace",
+		Input: []tf.Input{
+			input, pattern, rewrite,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes numerical negative value element-wise.
 //
 // I.e., \\(y = -x\\).
@@ -8745,7 +9284,7 @@ func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr
 // `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 // when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 // then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
 // This operation has a gradient and thus allows for training `min` and `max`
 // values.
@@ -9039,9 +9578,70 @@ func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Out
 	return scope.AddOperation(opspec)
 }
 
+// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
+//
+// is alive, any other request to use `MutexLock` with this mutex will wait.
+//
+// This is particularly useful for creating a critical section when used in
+// conjunction with `MutexLockIdentity`:
+//
+// ```python
+//
+// mutex = mutex_v2(
+//   shared_name=handle_name, container=container, name=name)
+//
+// def execute_in_critical_section(fn, *args, **kwargs):
+//   lock = gen_resource_variable_ops.mutex_lock(mutex)
+//
+//   with ops.control_dependencies([lock]):
+//     r = fn(*args, **kwargs)
+//
+//   with ops.control_dependencies(nest.flatten(r)):
+//     with ops.colocate_with(mutex):
+//       ensure_lock_exists = mutex_lock_identity(lock)
+//
+//     # Make sure that if any element of r is accessed, all of
+//     # them are executed together.
+//     r = nest.map_structure(tf.identity, r)
+//
+//   with ops.control_dependencies([ensure_lock_exists]):
+//     return nest.map_structure(tf.identity, r)
+// ```
+//
+// While `fn` is running in the critical section, no other functions which wish to
+// use this critical section may run.
+//
+// Often the use case is that two executions of the same graph, in parallel,
+// wish to run `fn`; and we wish to ensure that only one of them executes
+// at a time.  This is especially important if `fn` modifies one or more
+// variables at a time.
+//
+// It is also useful if two separate functions must share a resource, but we
+// wish to ensure the usage is exclusive.
+//
+// Arguments:
+//	mutex: The mutex resource to lock.
+//
+// Returns A tensor that keeps a shared pointer to a lock on the mutex;
+// when the Tensor is destroyed, the use count on the shared pointer is decreased
+// by 1.  When it reaches 0, the lock is released.
+func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MutexLock",
+		Input: []tf.Input{
+			mutex,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the mean along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -9453,9 +10053,8 @@ type ResizeAreaAttr func(optionalAttr)
 
 // ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
@@ -9467,6 +10066,11 @@ func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 //
 // Input images can be of different types but output images are always float.
 //
+// The range of pixel values for the output image might be slightly different
+// from the range for the input image because of limited numerical precision.
+// To guarantee an output range, for example `[0.0, 1.0]`, apply
+// `tf.clip_by_value` to the output.
+//
 // Each output pixel is computed by first transforming the pixel's footprint into
 // the input tensor and then averaging the pixels that intersect the footprint. An
 // input pixel's contribution to the average is weighted by the fraction of its
@@ -10471,6 +11075,50 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
+// Mutually reduces multiple tensors of identical type and shape.
+func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "merge_op": merge_op, "final_op": final_op, "subdiv_offsets": subdiv_offsets}
+	opspec := tf.OpSpec{
+		Type: "CollectiveReduce",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// This op consumes a lock created by `MutexLock`.
+//
+// This op exists to consume a tensor created by `MutexLock` (other than
+// direct control dependencies).  It should be the only op that consumes the tensor,
+// and will raise an error if it is not.  Its only purpose is to keep the
+// mutex lock tensor alive until it is consumed by this op.
+//
+// **NOTE**: This operation must run on the same device as its input.  This may
+// be enforced via the `colocate_with` mechanism.
+//
+// Arguments:
+//	mutex_lock: A tensor returned by `MutexLock`.
+//
+// Returns the created operation.
+func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConsumeMutexLock",
+		Input: []tf.Input{
+			mutex_lock,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Returns x / y element-wise for integer types.
 //
 // Truncation designates that negative numbers will round fractional quantities
@@ -10571,7 +11219,7 @@ func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_
 
 // Computes the maximum along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -10620,6 +11268,21 @@ func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Receives a tensor value broadcast from another device.
+func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "CollectiveBcastRecv",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Decode web-safe base64-encoded strings.
 //
 // Input may or may not have padding at the end. See EncodeBase64 for padding.
@@ -11452,6 +12115,35 @@ func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 	return op.Output(0), op.Output(1)
 }
 
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the logits. It is designed to be used during prediction.
+// It traverses all the trees and calculates the final score for each instance.
+//
+// Arguments:
+//
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Output rank 2 Tensor containing logits for each example.
+func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesPredict",
+		Input: []tf.Input{
+			tree_ensemble_handle, tf.OutputList(bucketized_features),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
 type MatrixSolveLsAttr func(optionalAttr)
 
@@ -11484,14 +12176,14 @@ func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 // If `fast` is `True`, then the solution is computed by solving the normal
 // equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
 // \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
+// If \\(m \lt n\\) then `output` is computed as
 // \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
 // minimum-norm solution to the under-determined linear system, i.e.
 // \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
 // subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
 // when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
 // sufficiently large.
 //
 // If `fast` is `False` an algorithm based on the numerically robust complete
@@ -11739,6 +12431,47 @@ func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtyp
 	return op.Output(0)
 }
 
+// Subtracts sparse updates from the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] -= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] -= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of values to subtract from `ref`.
+//
+// Returns the created operation.
+func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterSub",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Inverse 2D fast Fourier transform.
 //
 // Computes the inverse 2-dimensional discrete Fourier transform over the
@@ -12337,9 +13070,8 @@ type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
 // FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1),
-// which exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
 	return func(m optionalAttr) {
@@ -13065,6 +13797,117 @@ func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Outpu
 	return scope.AddOperation(opspec)
 }
 
+// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
+type CudnnRNNBackpropAttr func(optionalAttr)
+
+// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNBackpropSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Backprop step of CudnnRNN.
+//
+// Compute the backprop of both data and weights in a RNN.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: a 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore them in canonical form.
+// output: a 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: the same shape as input_h.
+// output_c: the same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in the forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNBackprop",
+		Input: []tf.Input{
+			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
 // FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
 type FractionalMaxPoolGradAttr func(optionalAttr)
 
@@ -13163,6 +14006,107 @@ func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator t
 	return scope.AddOperation(opspec)
 }
 
+// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
+type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
+
+// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Converts CudnnRNN params from canonical form to usable form.
+//
+// Writes a set of weights into the opaque params buffer so they can be used in
+// upcoming training or inferences.
+//
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNCanonicalToParams",
+		Input: []tf.Input{
+			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
 type SparseReduceMaxSparseAttr func(optionalAttr)
 
@@ -13357,6 +14301,47 @@ func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	return op.Output(0)
 }
 
+// Multiplies sparse updates into the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] *= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] *= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of values to multiply into `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMul",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Computes sigmoid of `x` element-wise.
 //
 // Specifically, `y = 1 / (1 + exp(-x))`.
@@ -13374,6 +14359,30 @@ func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Updates specified rows with values in `v`.
+//
+//     Computes `x[i, :] = v; return x`.
+//
+// Arguments:
+//	x: A tensor of type `T`.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InplaceUpdate",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // FusedBatchNormAttr is an optional argument to FusedBatchNorm.
 type FusedBatchNormAttr func(optionalAttr)
 
@@ -13584,6 +14593,43 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 	return op.Output(0)
 }
 
+// BoostedTreesEnsembleResourceHandleOpAttr is an optional argument to BoostedTreesEnsembleResourceHandleOp.
+type BoostedTreesEnsembleResourceHandleOpAttr func(optionalAttr)
+
+// BoostedTreesEnsembleResourceHandleOpContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func BoostedTreesEnsembleResourceHandleOpContainer(value string) BoostedTreesEnsembleResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// BoostedTreesEnsembleResourceHandleOpSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func BoostedTreesEnsembleResourceHandleOpSharedName(value string) BoostedTreesEnsembleResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a BoostedTreesEnsembleResource.
+func BoostedTreesEnsembleResourceHandleOp(scope *Scope, optional ...BoostedTreesEnsembleResourceHandleOpAttr) (resource tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesEnsembleResourceHandleOp",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Concatenates tensors along one dimension.
 //
 // Arguments:
@@ -13717,6 +14763,30 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output,
 	return op.Output(0)
 }
 
+// Returns the last element of the input list as well as a list with all but that element.
+//
+// Fails if the list is empty.
+//
+// input_handle: the input list
+// tensor: the withdrawn last element of the list
+// element_dtype: the type of elements in the list
+// element_shape: the shape of the output tensor
+func TensorListPopBack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListPopBack",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Returns element-wise integer closest to x.
 //
 // If the result is midway between two representable values,
@@ -14471,6 +15541,26 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 	return op.Output(0)
 }
 
+// Serializes the tree ensemble to a proto.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//
+// Returns Stamp token of the tree ensemble resource. Serialized proto of the ensemble.
+func BoostedTreesSerializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, tree_ensemble_serialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesSerializeEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // StageSizeAttr is an optional argument to StageSize.
 type StageSizeAttr func(optionalAttr)
 
@@ -14612,7 +15702,7 @@ func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_
 
 // Computes the sum along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -14668,6 +15758,99 @@ func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Outp
 	return op.Output(0)
 }
 
+// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
+type CudnnRNNParamsSizeAttr func(optionalAttr)
+
+// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes size of weights that can be used by a Cudnn RNN model.
+//
+// Return the params size that can be used by the Cudnn RNN model. Subsequent
+// weight allocation and initialization should use this size.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// params_size: The size of the params buffer that should be allocated and
+//   initialized for this RNN model. Note that this params buffer may not be
+//   compatible across GPUs. Please use CudnnRNNParamsWeights and
+//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
+//   across different runs.
+func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T, "S": S}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNParamsSize",
+		Input: []tf.Input{
+			num_layers, num_units, input_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes gradients for SparseSegmentMean.
 //
 // Returns tensor "output" with same shape as grad, except for dimension 0 whose
@@ -14696,6 +15879,7 @@ func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segm
 //
 // Note that this routine only supports wildcard characters in the
 // basename portion of the pattern, not in the directory portion.
+// Note also that the order of filenames returned can be non-deterministic.
 //
 // Arguments:
 //	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
@@ -15116,18 +16300,58 @@ func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
+type StatelessMultinomialAttr func(optionalAttr)
 
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["output_dtype"] = value
 	}
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
+// Draws samples from a multinomial distribution.
+//
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessMultinomial",
+		Input: []tf.Input{
+			logits, num_samples, seed,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
+
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
 // `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
 // Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
@@ -16505,11 +17729,8 @@ func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Out
 
 // Subtracts a value from the current value of a variable.
 //
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the decremented value or a subsequent newer one.
 //
 // Arguments:
 //	resource: handle to the resource in which to store the variable.
@@ -16594,9 +17815,8 @@ type QuantizedResizeBilinearAttr func(optionalAttr)
 
 // QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
 	return func(m optionalAttr) {
@@ -16638,7 +17858,7 @@ func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min
 
 // Computes the minimum along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -16677,8 +17897,8 @@ type SdcaOptimizerAttr func(optionalAttr)
 
 // SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
 //
-// value: Whether to use Adapative SDCA for the inner loop.
-// If not specified, defaults to false
+// value: Whether to use Adaptive SDCA for the inner loop.
+// If not specified, defaults to true
 func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
 	return func(m optionalAttr) {
 		m["adaptative"] = value
@@ -17248,7 +18468,7 @@ func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataTy
 // Duplicate entries are handled correctly: if multiple `indices` reference
 // the same location, their contributions add.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]`.
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 // <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
@@ -17526,6 +18746,43 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 	return op.Output(0)
 }
 
+// Computes the product along segments of a tensor.
+//
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the product of all
+// entries belonging to a segment such that:
+//
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If there is no entry for a given segment ID `i`, it outputs 1.
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
+//
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentProd",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // RandomUniformIntAttr is an optional argument to RandomUniformInt.
 type RandomUniformIntAttr func(optionalAttr)
 
@@ -18552,6 +19809,57 @@ func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, inp
 	return op.Output(0), op.Output(1)
 }
 
+// Rolls the elements of a tensor along an axis.
+//
+// The elements are shifted positively (towards larger indices) by the offset of
+// `shift` along the dimension of `axis`. Negative `shift` values will shift
+// elements in the opposite direction. Elements that roll past the last position
+// will wrap around to the first and vice versa. Multiple shifts along multiple
+// axes may be specified.
+//
+// For example:
+//
+// ```
+// # 't' is [0, 1, 2, 3, 4]
+// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+//
+// # shifting along multiple dimensions
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+//
+// # shifting along the same axis multiple times
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+// ```
+//
+// Arguments:
+//
+//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+// elements are shifted positively (towards larger indices) along the dimension
+// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+// direction.
+//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension along which the shift
+// `shift[i]` should occur. If the same axis is referenced more than once, the
+// total shift for that axis will be the sum of all the shifts that belong to that
+// axis.
+//
+// Returns Has the same shape and size as the input. The elements are shifted
+// positively (towards larger indices) by the offsets of `shift` along the
+// dimensions of `axis`.
+func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Roll",
+		Input: []tf.Input{
+			input, shift, axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // MapPeekAttr is an optional argument to MapPeek.
 type MapPeekAttr func(optionalAttr)
 
@@ -18690,6 +19998,68 @@ func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.O
 	return op.Output(0)
 }
 
+// Calculates gains for each feature and returns the best possible split information for the feature.
+//
+// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
+//
+// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+//
+// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+//
+// The output lists all have the same length, `num_features`.
+// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+//
+// Arguments:
+//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as in `for node_id in range(node_id_range[0], node_id_range[1])` (note that the last index node_id_range[1] is exclusive).
+//	stats_summary_list: A list of Rank 3 tensors (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per bucket for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+//	l1: l1 regularization factor on leaf weights, per instance based.
+//	l2: l2 regularization factor on leaf weights, per instance based.
+//	tree_complexity: adjustment to the gain, per leaf based.
+//	min_node_weight: minimum avg of hessians in a node required for the node to be considered for splitting.
+//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+//
+// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes. An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes. An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes. A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes. A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"max_splits": max_splits}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesCalculateBestGainsPerFeature",
+		Input: []tf.Input{
+			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
+}
+
 // EncodePngAttr is an optional argument to EncodePng.
 type EncodePngAttr func(optionalAttr)
 
@@ -18804,9 +20174,8 @@ type ResizeBilinearGradAttr func(optionalAttr)
 
 // ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
 // If not specified, defaults to false
 func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
@@ -19469,6 +20838,47 @@ func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = max(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of values to reduce into `ref` with `max`.
+//
+// Returns the created operation.
+func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMax",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Outputs a `Summary` protocol buffer with scalar values.
 //
 // The input `tags` and `values` must have the same shape.  The generated summary
@@ -19853,6 +21263,88 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 	return op.Output(0)
 }
 
+// Returns a list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
+//
+// tensor: The tensor to put on the list.
+// input_handle: The old list.
+// output_handle: A list with the elements of the old list followed by tensor.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListPushBack",
+		Input: []tf.Input{
+			input_handle, tensor,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of tensors in the input tensor list.
+//
+// input_handle: the input list
+// length: the number of tensors in the list
+func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListLength",
+		Input: []tf.Input{
+			input_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// The shape of the elements of the given list, as a tensor.
+//
+//   input_handle: the list
+//   element_shape: the shape of elements of the list
+func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape_type": shape_type}
+	opspec := tf.OpSpec{
+		Type: "TensorListElementShape",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the item in the list with the given index.
+//
+// input_handle: the list
+// index: the position in the list from which an element will be retrieved
+// item: the element at that position
+//
+//
+func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_dtype tf.DataType) (item tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListGetItem",
+		Input: []tf.Input{
+			input_handle, index,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the matrix exponential of one or more square matrices:
 //
 // exp(A) = \sum_{n=0}^\infty A^n/n!
@@ -19888,6 +21380,46 @@ func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Computes the matrix logarithm of one or more square matrices:
+//
+//
+// log(exp(A)) = A
+//
+// This op is only defined for complex matrices. If A is positive-definite and
+// real, then casting to a complex matrix, taking the logarithm and casting back
+// to a real matrix will give the correct result.
+//
+// This function computes the matrix logarithm using the Schur-Parlett algorithm.
+// Details of the algorithm can be found in Section 11.6.2 of:
+// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
+// ISBN 978-0-898716-46-7.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the logarithm for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.logm
+// @end_compatibility
+func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixLogarithm",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
 type QueueDequeueUpToV2Attr func(optionalAttr)
 
@@ -20067,7 +21599,8 @@ func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (ou
 // SelfAdjointEig.
 //
 // The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
+// are sorted in non-decreasing order.
 //
 // Arguments:
 //	input: Shape is `[..., M, M]`.
@@ -20125,7 +21658,8 @@ func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 // Computes the eigen decomposition of one or more square self-adjoint matrices.
 //
 // Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
+// are sorted in non-decreasing order.
 //
 // ```python
 // # a is a tensor.
@@ -20308,7 +21842,7 @@ func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output,
 
 // Computes the product along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -21182,29 +22716,53 @@ func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Out
 	return op.Output(0), op.Output(1)
 }
 
-// Restore a Reader to its initial clean state.
+// Adds v into specified rows of x.
+//
+// Computes y = x; y[i, :] += v; return y.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
+		Type: "InplaceAdd",
 		Input: []tf.Input{
-			reader_handle,
+			x, i, v,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
-
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+// Restore a Reader to its initial clean state.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderResetV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
+
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var, m, and v tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
@@ -21290,9 +22848,8 @@ type ResizeBicubicGradAttr func(optionalAttr)
 
 // ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
 // If not specified, defaults to false
 func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
 	return func(m optionalAttr) {
@@ -21334,9 +22891,8 @@ type ResizeNearestNeighborAttr func(optionalAttr)
 
 // ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
 	return func(m optionalAttr) {
@@ -21377,9 +22933,8 @@ type ResizeNearestNeighborGradAttr func(optionalAttr)
 
 // ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
 // If not specified, defaults to false
 func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
 	return func(m optionalAttr) {
@@ -21778,6 +23333,58 @@ func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//
+// Returns Stamp token of the tree ensemble resource. The number of trees in the tree ensemble resource. The number of trees that were finished successfully. The number of layers we attempted to build (but not necessarily succeeded). Rank size 2 tensor that contains start and end ids of the nodes in the latest
+// layer.
+func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesGetEnsembleStates",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Gets the next output from the given iterator.
+//
+// This operation is a synchronous version of IteratorGetNext. It should only be used
+// in situations where the iterator does not block the calling thread, or where
+// the calling thread is not a member of the thread pool used to execute parallel
+// operations (e.g. in eager mode).
+func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "IteratorGetNextSync",
+		Input: []tf.Input{
+			iterator,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNextSync", err)
+		return
+	}
+	return components
+}
+
 // SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
 type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
 
@@ -22326,6 +23933,83 @@ func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 	return op.Output(0)
 }
 
+// EncodeProtoAttr is an optional argument to EncodeProto.
+type EncodeProtoAttr func(optionalAttr)
+
+// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
+// If not specified, defaults to "local://"
+func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
+	return func(m optionalAttr) {
+		m["descriptor_source"] = value
+	}
+}
+
+// The op serializes protobuf messages provided in the input tensors.
+//
+// The types of the tensors in `values` must match the schema for the
+// fields specified in `field_names`. All the tensors in `values` must
+// have a common shape prefix, *batch_shape*.
+//
+// The `sizes` tensor specifies repeat counts for each field.  The repeat
+// count (last dimension) of each tensor in `values` must be greater
+// than or equal to the corresponding repeat count in `sizes`.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// There are a few special cases in the value mapping:
+//
+// Submessage and group fields must be pre-serialized as TensorFlow strings.
+//
+// TensorFlow lacks support for unsigned int64s, so they must be
+// represented as `tf.int64` with the same twos-complement bit pattern
+// (the obvious way).
+//
+// Unsigned int32 values can be represented exactly with `tf.int64`, or
+// with sign wrapping if the input is of type `tf.int32`.
+//
+// Arguments:
+//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+//	values: List of tensors containing values for the corresponding field.
+//	field_names: List of strings containing proto field names.
+//	message_type: Name of the proto message type to encode.
+//
+// Returns Tensor of serialized protos with shape `batch_shape`.
+func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeProto",
+		Input: []tf.Input{
+			sizes, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a TensorArray for storing the gradients of values in the given handle.
 //
 // If the given TensorArray gradient already exists, returns a reference to it.
@@ -22386,6 +24070,132 @@ func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source
 	return op.Output(0), op.Output(1)
 }
 
+// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
+type DecodeProtoV2Attr func(optionalAttr)
+
+// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
+//
+// value: Either the special value `local://` or a path to a file containing
+// a serialized `FileDescriptorSet`.
+// If not specified, defaults to "local://"
+func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["descriptor_source"] = value
+	}
+}
+
+// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
+//
+// value: Either `binary` or `text`.
+// If not specified, defaults to "binary"
+func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["message_format"] = value
+	}
+}
+
+// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
+//
+// value: Whether to sanitize the result or not.
+// If not specified, defaults to false
+func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["sanitize"] = value
+	}
+}
+
+// The op extracts fields from a serialized protocol buffers message into tensors.
+//
+// The `decode_proto` op extracts fields from a serialized protocol buffers
+// message into tensors.  The fields in `field_names` are decoded and converted
+// to the corresponding `output_types` if possible.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// Each output tensor is a dense tensor. This means that it is padded to
+// hold the largest number of repeated elements seen in the input
+// minibatch. (The shape is also padded by one to prevent zero-sized
+// dimensions). The actual repeat counts for each example in the
+// minibatch can be found in the `sizes` output. In many cases the output
+// of `decode_proto` is fed immediately into tf.squeeze if missing values
+// are not a concern. When using tf.squeeze, always pass the squeeze
+// dimension explicitly to avoid surprises.
+//
+// For the most part, the mapping between Proto field types and
+// TensorFlow dtypes is straightforward. However, there are a few
+// special cases:
+//
+// - A proto field that contains a submessage or group can only be converted
+// to `DT_STRING` (the serialized submessage). This is to reduce the
+// complexity of the API. The resulting string can be used as input
+// to another instance of the decode_proto op.
+//
+// - TensorFlow lacks support for unsigned integers. The ops represent uint64
+// types as a `DT_INT64` with the same twos-complement bit pattern
+// (the obvious way). Unsigned int32 values can be represented exactly by
+// specifying type `DT_INT64`, or using twos-complement if the caller
+// specifies `DT_INT32` in the `output_types` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// Both binary and text proto serializations are supported, and can be
+// chosen using the `message_format` attribute.
+//
+// Arguments:
+//	bytes: Tensor of serialized protos with shape `batch_shape`.
+//	message_type: Name of the proto message type to decode.
+//	field_names: List of strings containing proto field names.
+//	output_types: List of TF types to use for the respective field in field_names.
+//
+// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+// Each entry is the number of values found for the corresponding field.
+// Optional fields may have 0 or 1 values. List of tensors containing values for the corresponding field.
+// `values[i]` has datatype `output_types[i]`
+// and shape `[batch_shape, max(sizes[...,i])]`.
+func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeProtoV2",
+		Input: []tf.Input{
+			bytes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	// Output 0 is `sizes`; the `values` list must start at output 1 so that it does not alias `sizes`.
+	var err error
+	sizes = op.Output(0)
+	if values, _, err = makeOutputList(op, 1, "values"); err != nil {
+		scope.UpdateErr("DecodeProtoV2", err)
+		return
+	}
+	return sizes, values
+}
+
 // Creates a dataset that splits a SparseTensor into elements row-wise.
 func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
@@ -22440,11 +24250,8 @@ func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset t
 
 // Adds a value to the current value of a variable.
 //
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the incremented value or a subsequent newer one.
 //
 // Arguments:
 //	resource: handle to the resource in which to store the variable.
@@ -23107,6 +24914,35 @@ func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Op
 	return scope.AddOperation(opspec)
 }
 
+// Makes the summary of accumulated stats for the batch.
+//
+// The summary stats contain gradients and hessians accumulated into the corresponding node and bucket for each example.
+//
+// Arguments:
+//	node_ids: int32 Rank 1 Tensor containing node ids, which each example falls into for the requested layer.
+//	gradients: float32; Rank 2 Tensor (shape=[#examples, 1]) for gradients.
+//	hessians: float32; Rank 2 Tensor (shape=[#examples, 1]) for hessians.
+//	bucketized_features_list: int32 list of Rank 1 Tensors, each containing the bucketized feature (for each feature column).
+//	max_splits: int; the maximum number of splits possible in the whole tree.
+//	num_buckets: int; equal to the maximum possible value of a bucketized feature.
+//
+// Returns output Rank 4 Tensor (shape=[#features, #splits, #buckets, 2]) containing accumulated stats put into the corresponding node and bucket. The first index of 4th dimension refers to gradients, and the second to hessians.
+func BoostedTreesMakeStatsSummary(scope *Scope, node_ids tf.Output, gradients tf.Output, hessians tf.Output, bucketized_features_list []tf.Output, max_splits int64, num_buckets int64) (stats_summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"max_splits": max_splits, "num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesMakeStatsSummary",
+		Input: []tf.Input{
+			node_ids, gradients, hessians, tf.OutputList(bucketized_features_list),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Adjust the contrast of one or more images.
 //
 // `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
@@ -23331,6 +25167,10 @@ func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 // <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
 // </div>
 //
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
 // Arguments:
 //	params: The tensor from which to gather values. Must be at least rank
 // `axis + 1`.
@@ -23827,6 +25667,28 @@ func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// List of the given size with empty elements.
+//
+// element_shape: the shape of the future elements of the list
+// num_elements: the number of elements to reserve
+// handle: the output list
+// element_dtype: the desired type of elements in the list.
+func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListReserve",
+		Input: []tf.Input{
+			element_shape, num_elements,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
 type PriorityQueueV2Attr func(optionalAttr)
 
@@ -24366,6 +26228,125 @@ func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional .
 	return op.Output(0)
 }
 
+// RpcAttr is an optional argument to Rpc.
+type RpcAttr func(optionalAttr)
+
+// RpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func RpcProtocol(value string) RpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// RpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func RpcFailFast(value bool) RpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func RpcTimeoutInMs(value int64) RpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the RPC method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// If the connection fails or the remote worker returns an error
+// status, the op reraises this exception locally.
+//
+// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
+func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Rpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // OrderedMapStageAttr is an optional argument to OrderedMapStage.
 type OrderedMapStageAttr func(optionalAttr)
 
@@ -24604,30 +26585,140 @@ func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompresse
 	return op.Output(0)
 }
 
-// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
-//
-// Each comparison returns a boolean `true` (if `input_value > threshold`)
-// or and `false` otherwise.
-//
-// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-// algorithms that use hashing approximations of cosine and `L2` distances;
-// codes can be generated from an input via:
-//
-// ```python
-// codebook_size = 50
-// codebook_bits = codebook_size * 32
-// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-//                            dtype=x.dtype,
-//                            initializer=tf.orthogonal_initializer())
-// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-// # now codes has shape x.shape[:-1] + [codebook_size]
-// ```
-//
-// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
-// by 8.
-//
-// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+// CudnnRNNAttr is an optional argument to CudnnRNN.
+type CudnnRNNAttr func(optionalAttr)
+
+// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNRnnMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNInputMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNDirection(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNDropout(value float32) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed2(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNIsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// A RNN backed by cuDNN.
+//
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: a 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: a 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: the same shape as input_h.
+// output_c: the same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inference or
+//   training.
+// reserve_space: an opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is false.
+func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNN",
+		Input: []tf.Input{
+			input, input_h, input_c, params,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
+//
+// Each comparison returns a boolean `true` (if `input_value > threshold`)
+// or `false` otherwise.
+//
+// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+// algorithms that use hashing approximations of cosine and `L2` distances;
+// codes can be generated from an input via:
+//
+// ```python
+// codebook_size = 50
+// codebook_bits = codebook_size * 32
+// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+//                            dtype=x.dtype,
+//                            initializer=tf.orthogonal_initializer())
+// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+// # now codes has shape x.shape[:-1] + [codebook_size]
+// ```
+//
+// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
+// by 8.
+//
+// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
 // a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
 //
 // Arguments:
@@ -24697,6 +26788,47 @@ func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, val
 	return op.Output(0)
 }
 
+// EmptyAttr is an optional argument to Empty.
+type EmptyAttr func(optionalAttr)
+
+// EmptyInit sets the optional init attribute to value.
+//
+// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initialize the tensor's content.
+// If not specified, defaults to false
+func EmptyInit(value bool) EmptyAttr {
+	return func(m optionalAttr) {
+		m["init"] = value
+	}
+}
+
+// Creates a tensor with the given shape.
+//
+// This operation creates a tensor of `shape` and `dtype`.
+//
+// Arguments:
+//	shape: 1-D. Represents the shape of the output tensor.
+//
+//
+// Returns A `Tensor` of type `T`.
+func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Empty",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
 type TensorArrayConcatV3Attr func(optionalAttr)
 
@@ -24814,6 +26946,27 @@ func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output
 	return op.Output(0)
 }
 
+// Sets the index-th position of the list to contain the given tensor.
+//
+// input_handle: the list
+// index: the position in the list to which the tensor will be assigned
+// item: the element to be assigned to that position
+// output_handle: the new list, with the element in the proper position
+//
+func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListSetItem",
+		Input: []tf.Input{
+			input_handle, index, item,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns a diagonal tensor with a given diagonal values.
 //
 // Given a `diagonal`, this operation returns a tensor with the `diagonal` and
@@ -25358,6 +27511,27 @@ func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, val
 	return op.Output(0)
 }
 
+// Creates a tree ensemble model and returns a handle to it.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble resource to be created.
+//	stamp_token: Token to use as the initial value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the tree ensemble.
+//
+// Returns the created operation.
+func BoostedTreesCreateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesCreateEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Applies sparse addition to `input` using individual values or slices
 //
 // from `updates` according to indices `indices`.  The updates are non-aliasing:
@@ -26238,6 +28412,120 @@ func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []t
 	return output
 }
 
+// CudnnRNNParamsToCanonicalAttr is an optional argument to CudnnRNNParamsToCanonical.
+type CudnnRNNParamsToCanonicalAttr func(optionalAttr)
+
+// CudnnRNNParamsToCanonicalRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsToCanonicalRnnMode(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsToCanonicalInputMode(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsToCanonicalDirection(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalDropout(value float32) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalSeed(value int64) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalSeed2(value int64) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Retrieves CudnnRNN params in canonical form.
+//
+// Retrieves a set of weights from the opaque params buffer that can be saved and
+// restored in a way compatible with future runs.
+//
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNParamsToCanonical(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, params tf.Output, num_params int64, optional ...CudnnRNNParamsToCanonicalAttr) (weights []tf.Output, biases []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_params": num_params}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNParamsToCanonical",
+		Input: []tf.Input{
+			num_layers, num_units, input_size, params,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if weights, idx, err = makeOutputList(op, idx, "weights"); err != nil {
+		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
+		return
+	}
+	if biases, idx, err = makeOutputList(op, idx, "biases"); err != nil {
+		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
+		return
+	}
+	return weights, biases
+}
+
 // UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
 type UniformCandidateSamplerAttr func(optionalAttr)
 
@@ -26480,6 +28768,128 @@ func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
 	return op.Output(0)
 }
 
+// TryRpcAttr is an optional argument to TryRpc.
+type TryRpcAttr func(optionalAttr)
+
+// TryRpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func TryRpcProtocol(value string) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// TryRpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func TryRpcFailFast(value bool) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func TryRpcTimeoutInMs(value int64) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// Unlike the standard `Rpc` op, if the connection fails or the remote worker
+// returns an error status, this op does **not** reraise the exception.
+// Instead, the `status_code` and `status_message` entry for the corresponding RPC
+// call is set with the error returned from the RPC call.  The `response` tensor
+// will contain valid response values for those minibatch entries whose RPCs did
+// not fail; the rest of the entries will have empty strings.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses. Same shape as `request`.  Values correspond to tensorflow Status enum codes. Same shape as `request`.  Values correspond to Status messages
+// returned from the RPC calls.
+func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TryRpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // EnterAttr is an optional argument to Enter.
 type EnterAttr func(optionalAttr)
 
@@ -26915,6 +29325,64 @@ func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.
 	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
+// Deserializes a serialized tree ensemble config and replaces current tree
+//
+// ensemble.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//	stamp_token: Token to use as the new value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the ensemble.
+//
+// Returns the created operation.
+func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesDeserializeEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the update to cached logits. It is designed to be used during training.
+// It traverses the trees starting from cached tree id and cached node id and
+// calculates the updates to be pushed to the cache.
+//
+// Arguments:
+//
+//	cached_tree_ids: Rank 1 Tensor containing cached tree ids which is the starting
+// tree of prediction.
+//	cached_node_ids: Rank 1 Tensor containing cached node id which is the starting
+// node of prediction.
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Rank 2 Tensor containing logits update (with respect to cached
+// values stored) for each example. Rank 1 Tensor containing new tree ids for each example. Rank 1 Tensor containing new node ids in the new tree_ids.
+func BoostedTreesTrainingPredict(scope *Scope, tree_ensemble_handle tf.Output, cached_tree_ids tf.Output, cached_node_ids tf.Output, bucketized_features []tf.Output, logits_dimension int64) (partial_logits tf.Output, tree_ids tf.Output, node_ids tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesTrainingPredict",
+		Input: []tf.Input{
+			tree_ensemble_handle, cached_tree_ids, cached_node_ids, tf.OutputList(bucketized_features),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // Elementwise computes the bitwise AND of `x` and `y`.
 //
 // The result will have those bits set, that are set in both `x` and `y`. The
@@ -26951,6 +29419,44 @@ func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// TensorListStackAttr is an optional argument to TensorListStack.
+type TensorListStackAttr func(optionalAttr)
+
+// TensorListStackNumElements sets the optional num_elements attribute to value.
+// If not specified, defaults to -1
+func TensorListStackNumElements(value int64) TensorListStackAttr {
+	return func(m optionalAttr) {
+		m["num_elements"] = value
+	}
+}
+
+// Stacks all tensors in the list.
+//
+// Requires that all tensors have the same shape.
+//
+// input_handle: the input list
+// tensor: the gathered result
+// num_elements: optional. If not -1, the number of elements in the list.
+//
+func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListStack",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Elementwise computes the bitwise right-shift of `x` and `y`.
 //
 // Performs a logical shift for unsigned integer types, and an arithmetic shift
@@ -27000,6 +29506,175 @@ func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Outpu
 	return op.Output(0)
 }
 
+// BatchAttr is an optional argument to Batch.
+type BatchAttr func(optionalAttr)
+
+// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
+// If not specified, defaults to 10
+func BatchMaxEnqueuedBatches(value int64) BatchAttr {
+	return func(m optionalAttr) {
+		m["max_enqueued_batches"] = value
+	}
+}
+
+// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
+// If not specified, defaults to <>
+func BatchAllowedBatchSizes(value []int64) BatchAttr {
+	return func(m optionalAttr) {
+		m["allowed_batch_sizes"] = value
+	}
+}
+
+// BatchContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func BatchContainer(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// BatchSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func BatchSharedName(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// BatchBatchingQueue sets the optional batching_queue attribute to value.
+// If not specified, defaults to ""
+func BatchBatchingQueue(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["batching_queue"] = value
+	}
+}
+
+// Batches all input tensors nondeterministically.
+//
+// When many instances of this Op are being run concurrently with the same
+// container/shared_name in the same device, some will output zero-shaped Tensors
+// and others will output Tensors of size up to max_batch_size.
+//
+// All Tensors in in_tensors are batched together (so, for example, labels and
+// features should be batched with a single instance of this operation).
+//
+// Each invocation of batch emits an `id` scalar which will be used to identify
+// this particular invocation when doing unbatch or its gradient.
+//
+// Each op which emits a non-empty batch will also emit a non-empty batch_index
+// Tensor, which is a [K, 3] matrix where each row contains the invocation's id,
+// start, and length of elements of each set of Tensors present in batched_tensors.
+//
+// Batched tensors are concatenated along the first dimension, and all tensors in
+// in_tensors must have the first dimension of the same size.
+//
+// in_tensors: The tensors to be batched.
+// num_batch_threads: Number of scheduling threads for processing batches of work.
+//  Determines the number of batches processed in parallel.
+// max_batch_size: Batch sizes will never be bigger than this.
+// batch_timeout_micros: Maximum number of microseconds to wait before outputting
+//  an incomplete batch.
+// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+//  batches up to one of those sizes. The entries must increase monotonically, and
+//  the final entry must equal max_batch_size.
+// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+// batch_index: If batched_tensors is non-empty, has information to invert it.
+// container: Controls the scope of sharing of this batch.
+// id: always contains a scalar with a unique ID for this invocation of Batch.
+// shared_name: Concurrently running instances of batch in the same device with the
+//  same container and shared_name will batch their elements together. If left
+//  empty, the op name will be used as the shared name.
+// T: the types of tensors to be batched.
+func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Batch",
+		Input: []tf.Input{
+			tf.OutputList(in_tensors),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
+		scope.UpdateErr("Batch", err)
+		return
+	}
+	batch_index = op.Output(idx)
+	id = op.Output(idx + 1) // id is the output after batch_index; reusing idx would alias the two
+	return batched_tensors, batch_index, id
+}
+
+// UnbatchAttr is an optional argument to Unbatch.
+type UnbatchAttr func(optionalAttr)
+
+// UnbatchContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnbatchContainer(value string) UnbatchAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnbatchSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnbatchSharedName(value string) UnbatchAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Reverses the operation of Batch for a single output Tensor.
+//
+// An instance of Unbatch either receives an empty batched_tensor, in which case it
+// asynchronously waits until the values become available from a concurrently
+// running instance of Unbatch with the same container and shared_name, or receives
+// a non-empty batched_tensor in which case it finalizes all other concurrently
+// running instances and outputs its own element from the batch.
+//
+// batched_tensor: The possibly transformed output of Batch. The size of the first
+//  dimension should remain unchanged by the transformations for the operation to
+//  work.
+// batch_index: The matching batch_index obtained from Batch.
+// id: The id scalar emitted by Batch.
+// unbatched_tensor: The Tensor corresponding to this execution.
+// timeout_micros: Maximum amount of time (in microseconds) to wait to receive the
+//  batched input tensor associated with a given invocation of the op.
+// container: Container to control resource sharing.
+// shared_name: Instances of Unbatch with the same container and shared_name are
+//  assumed to possibly belong to the same batch. If left empty, the op name will
+//  be used as the shared name.
+func Unbatch(scope *Scope, batched_tensor tf.Output, batch_index tf.Output, id tf.Output, timeout_micros int64, optional ...UnbatchAttr) (unbatched_tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"timeout_micros": timeout_micros}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Unbatch",
+		Input: []tf.Input{
+			batched_tensor, batch_index, id,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
 type AvgPool3DGradAttr func(optionalAttr)
 
@@ -27212,6 +29887,60 @@ func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list
 	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
 }
 
+// UnbatchGradAttr is an optional argument to UnbatchGrad.
+type UnbatchGradAttr func(optionalAttr)
+
+// UnbatchGradContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradContainer(value string) UnbatchGradAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnbatchGradSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradSharedName(value string) UnbatchGradAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Gradient of Unbatch.
+//
+// Acts like Batch but using the given batch_index index of batching things as they
+// become available. This ensures that the gradients are propagated back in the
+// same session which did the forward pass.
+//
+// original_input: The input to the Unbatch operation this is the gradient of.
+// batch_index: The batch_index given to the Unbatch operation this is the gradient
+// of.
+// grad: The downstream gradient.
+// id: The id scalar emitted by Batch.
+// batched_grad: The return value, either an empty tensor or the batched gradient.
+// container: Container to control resource sharing.
+// shared_name: Instances of UnbatchGrad with the same container and shared_name
+//  are assumed to possibly belong to the same batch. If left empty, the op name
+//  will be used as the shared name.
+func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnbatchGrad",
+		Input: []tf.Input{
+			original_input, batch_index, grad, id,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // DecodeWavAttr is an optional argument to DecodeWav.
 type DecodeWavAttr func(optionalAttr)
 
@@ -27317,6 +30046,60 @@ func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf
 	return op.Output(0)
 }
 
+//     Subtracts `v` from specified rows of `x`.
+//
+//     Computes y = x; y[i, :] -= v; return y.
+//
+// Arguments:
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceSub(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InplaceSub",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts a flat index or array of flat indices into a tuple of
+//
+// coordinate arrays.
+//
+// @compatibility(numpy)
+// Equivalent to np.unravel_index
+// @end_compatibility
+//
+// Arguments:
+//	indices: A 0-D or 1-D `int` Tensor whose elements are indices into the
+// flattened version of an array of dimensions dims.
+//	dims: A 1-D `int` Tensor. The shape of the array to use for unraveling
+// indices.
+//
+// Returns A 2-D (or 1-D if indices is 0-D) tensor where each row has the
+// same shape as the indices array.
+func UnravelIndex(scope *Scope, indices tf.Output, dims tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnravelIndex",
+		Input: []tf.Input{
+			indices, dims,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Compute the lower regularized incomplete Gamma function `Q(a, x)`.
 //
 // The lower regularized incomplete Gamma function is defined as:
@@ -27799,6 +30582,10 @@ func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values
 //
 //     indices.shape[:-1] + params.shape[indices.shape[-1]:]
 //
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
 // Some examples below.
 //
 // Simple indexing into a matrix:
-- 
GitLab


From 48589205460a876a9ac783bd9b7fc3af99f8defb Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 18 Apr 2018 10:58:56 -0700
Subject: [PATCH 075/450] Fix issue where git_tag_override would fail if "-" in
 tag name.

---
 tensorflow/tools/git/gen_git_source.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index db2580755b..7f0f325119 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -164,14 +164,18 @@ def get_git_version(git_base_path, git_tag_override):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
-    if git_tag_override:
+    if git_tag_override and val:
       split_val = val.split("-")
-      if len(split_val) != 3:
+      if len(split_val) < 3:
         raise Exception(
             ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
              "but got '%s'") % val)
-      split_val[0] = git_tag_override
-      val = bytes("-".join(split_val))
+      # There might be "-" in the tag name. But we can be sure that the final
+      # two "-" are those inserted by the git describe command.
+      commits_ahead_of_tag = split_val[-2]
+      abbrev_commit = split_val[-1]
+      val = bytes(
+          "-".join([git_tag_override, commits_ahead_of_tag, abbrev_commit]))
     return val if val else unknown_label
   except subprocess.CalledProcessError:
     return unknown_label
-- 
GitLab


From 5994156438a8d863dab04161589b34a3d0eb01d6 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 18 Apr 2018 11:26:40 -0700
Subject: [PATCH 076/450] Fix gen_git_version script not being able to find git
 binary.

This error is happening on our Windows release builds. Making sure
we add git binary to the PATH for Bazel.
---
 tensorflow/tools/ci_build/windows/bazel/common_env.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 7d4cc7ac30..0e6c0227b7 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -44,6 +44,8 @@ export PYTHON_LIB_PATH="C:/${PYTHON_BASE_PATH}/lib/site-packages"
 # Add python into PATH, it's needed because gen_git_source.py uses
 # '/usr/bin/env python' as a shebang
 export PATH="/c/${PYTHON_BASE_PATH}:$PATH"
+# Add git into PATH needed for gen_git_source.py
+export PATH="/c/Program Files/Git/cmd:$PATH"
 
 # Make sure we have pip in PATH
 export PATH="/c/${PYTHON_BASE_PATH}/Scripts:$PATH"
-- 
GitLab


From 31f925c7783fb8fa58278b31585dcf7bdb4cfd8c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 11:46:07 -0700
Subject: [PATCH 077/450] Change operands of subtraction expression to have
 well-defined behaviour.

At present, signed arithmetic overflows (i.e. has undefined behaviour) in general, e.g. when computing 0 - INT_MIN or INT_MAX - INT_MIN. The fact that we want the result in the unsigned type does not help us here.

The fix is to convert the operands to the corresponding unsigned type first and then perform the operation in unsigned arithmetic, which is well-defined and has the correct subtraction behaviour.

PiperOrigin-RevId: 193391813
---
 tensorflow/core/lib/random/random_distributions.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index ad16dbf01f..4cf3a999f6 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -164,7 +164,8 @@ class UniformDistribution<Generator, int32> {
   typedef int32 ResultElementType;
 
   // Must have lo < hi
-  UniformDistribution(int32 lo, int32 hi) : lo_(lo), range_(hi - lo) {}
+  UniformDistribution(int32 lo, int32 hi)
+      : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
 
   PHILOX_DEVICE_INLINE
   ResultType operator()(Generator* gen) {
@@ -198,7 +199,8 @@ class UniformDistribution<Generator, int64> {
   typedef int64 ResultElementType;
 
   // Must have lo < hi
-  UniformDistribution(int64 lo, int64 hi) : lo_(lo), range_(hi - lo) {}
+  UniformDistribution(int64 lo, int64 hi)
+      : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
 
   PHILOX_DEVICE_INLINE
   ResultType operator()(Generator* gen) {
-- 
GitLab


From 60444df318439654324ff797d66734c9920e48a2 Mon Sep 17 00:00:00 2001
From: Chris Kennelly <ckennelly@google.com>
Date: Wed, 18 Apr 2018 11:50:46 -0700
Subject: [PATCH 078/450] Expose an API for invoking sized delete.

Sized delete avoids a costly lookup to map the pointer to the allocated size
when this information is commonly available (Allocator::Deallocate).  As this
code also provides an alignment, we only use these paths when aligned new is
available.

PiperOrigin-RevId: 193392688
---
 tensorflow/core/framework/allocator.cc | 27 ++++++++++++++++++++++++++
 tensorflow/core/framework/allocator.h  | 11 ++++++++++-
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 1a7e5219cd..29b67ebdfa 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -48,6 +48,10 @@ constexpr size_t Allocator::kAllocatorAlignment;
 
 Allocator::~Allocator() {}
 
+void Allocator::DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) {
+  DeallocateRaw(ptr);
+}
+
 void RunResourceCtor(ResourceHandle* p, size_t n) {
   for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle();
 }
@@ -103,7 +107,12 @@ class CPUAllocator : public Allocator {
                    << "% of system memory.";
     }
 
+#ifdef __cpp_aligned_new
+    void* p =
+        ::operator new(num_bytes, static_cast<std::align_val_t>(alignment));
+#else
     void* p = port::AlignedMalloc(num_bytes, alignment);
+#endif
     if (cpu_allocator_collect_stats) {
       const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
       mutex_lock l(mu_);
@@ -132,7 +141,25 @@ class CPUAllocator : public Allocator {
       mutex_lock l(mu_);
       stats_.bytes_in_use -= alloc_size;
     }
+#ifdef __cpp_aligned_new
+    ::operator delete(ptr);
+#else
     port::AlignedFree(ptr);
+#endif
+  }
+
+  void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) override {
+#ifdef __cpp_aligned_new
+    if (cpu_allocator_collect_stats) {
+      const std::size_t alloc_size =
+          port::MallocExtension_GetAllocatedSize(ptr);
+      mutex_lock l(mu_);
+      stats_.bytes_in_use -= alloc_size;
+    }
+    ::operator delete(ptr, num_bytes, static_cast<std::align_val_t>(alignment));
+#else
+    DeallocateRaw(ptr);
+#endif
   }
 
   void GetStats(AllocatorStats* stats) override {
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 2c87156dca..0dda38fbb7 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -101,6 +101,11 @@ class Allocator {
   // REQUIRES: "ptr" was previously returned by a call to AllocateRaw
   virtual void DeallocateRaw(void* ptr) = 0;
 
+  // Deallocate a block of memory pointed to by "ptr" with size "num_bytes"
+  // REQUIRES: "ptr" was previously returned by a call to AllocateRaw with
+  // "num_bytes" and "alignment"
+  virtual void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes);
+
   // Convenience functions to do typed allocation.  C++ constructors
   // and destructors are invoked for complex types if necessary,
   // depending on the concrete Allocator implementation. May return
@@ -132,7 +137,7 @@ class Allocator {
   void Deallocate(T* ptr, size_t num_elements) {
     if (ptr) {
       RunDtor<T>(ptr, num_elements);
-      DeallocateRaw(ptr);
+      DeallocateRaw(ptr, kAllocatorAlignment, sizeof(T) * num_elements);
     }
   }
 
@@ -304,6 +309,10 @@ class AllocatorWrapper : public Allocator {
 
   void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); }
 
+  void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) override {
+    wrapped_->DeallocateRaw(ptr, alignment, num_bytes);
+  }
+
   bool TracksAllocationSizes() override {
     return wrapped_->TracksAllocationSizes();
   }
-- 
GitLab


From 03d18ae232c3cff4c56d1efec7bf29f9b16c4f68 Mon Sep 17 00:00:00 2001
From: Priya Gupta <priyag@google.com>
Date: Wed, 18 Apr 2018 12:03:32 -0700
Subject: [PATCH 079/450] Add support for initializable iterator in
 distribution strategies. Use that in estimator.

PiperOrigin-RevId: 193394603
---
 tensorflow/contrib/distribute/README.md       |  2 -
 .../distribute/python/minimize_loss_test.py   | 12 ++-
 .../distribute/python/mirrored_strategy.py    |  3 +-
 .../python/mirrored_strategy_multigpu_test.py |  3 +-
 .../distribute/python/one_device_strategy.py  |  7 +-
 .../distribute/python/optimizer_v2_test.py    |  3 +-
 .../distribute/python/prefetching_ops_v2.py   | 83 ++++++++++++++++---
 .../python/prefetching_ops_v2_test.py         | 22 +++++
 .../contrib/distribute/python/step_fn.py      |  3 +-
 .../contrib/distribute/python/values.py       | 22 ++---
 .../contrib/distribute/python/values_test.py  | 27 ++++++
 tensorflow/python/estimator/estimator.py      | 13 +--
 tensorflow/python/training/distribute.py      | 38 +++++----
 13 files changed, 174 insertions(+), 64 deletions(-)

diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 5d22d9aa2b..44a4481021 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -131,8 +131,6 @@ adjusting your learning rate or batch size according to the number of GPUs.
 We are working on addressing this limitation by splitting each batch across GPUs
 instead.
 * PartitionedVariables are not supported yet.
-* Input pipelines with Datasets that capture stateful objects and rely on
-`make_initializable_iterator` are not supported yet.
 
 ## What's next?
 
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index 4219d54cbd..d7fbf7f379 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -67,7 +67,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       if is_tpu:
         dataset = dataset.batch(2)
 
-      iterator = distribution.distribute_dataset(dataset)
+      iterator = distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
 
       def run_step():
         # TODO(isaprykin): Make iterator get_next() return a list of sub-
@@ -127,7 +128,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           use_callable_loss=True,
           create_optimizer_inside_model_fn=True)
 
-      iterator = distribution.distribute_dataset(dataset)
+      iterator = distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
 
       def run_step():
         return distribution.group(
@@ -185,7 +187,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       # on each device.
       if isinstance(distribution, mirrored_strategy.MirroredStrategy):
         distribution._prefetch_on_device = False
-      iterator = distribution.distribute_dataset(dataset)
+      iterator = distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
 
       def run_step():
         return control_flow_ops.group(
@@ -260,7 +263,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
       labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
       dataset = dataset_ops.Dataset.zip((features, labels)).repeat()
-      iterator = distribution.distribute_dataset(dataset)
+      iterator = distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
 
       def run_step():
         return distribution.group(
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index eb0edb3a11..d5e22e8100 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -141,9 +141,8 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
     return result
 
   def distribute_dataset(self, dataset):
-    per_device_dataset = values.PerDeviceDataset(
+    return values.PerDeviceDataset(
         dataset, self._devices, self._prefetch_on_device)
-    return per_device_dataset.make_one_shot_iterator()
 
   def _broadcast(self, tensor, destinations):
     # TODO(josh11b): In eager mode, use one thread per device, or async mode.
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 9e9f06da8e..59cd6703b9 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -248,7 +248,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     dist = mirrored_strategy.MirroredStrategy(
         ["/device:GPU:0", "/device:CPU:0"])
     features = dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)
-    features = dist.distribute_dataset(features).get_next()
+    features = dist.distribute_dataset(
+        features).make_one_shot_iterator().get_next()
 
     with dist.scope():
       result = dist.call_for_each_tower(
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 39c49442b9..2002266dd5 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -21,8 +21,6 @@ from __future__ import print_function
 import six
 
 from tensorflow.contrib.distribute.python import values
-from tensorflow.contrib.eager.python import datasets
-from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -63,10 +61,7 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
       return next_creator(*args, **kwargs)
 
   def distribute_dataset(self, dataset):
-    if context.executing_eagerly():
-      return datasets.Iterator(dataset)
-    else:
-      return dataset.make_one_shot_iterator()
+    return dataset
 
   def _broadcast(self, tensor, destinations):
     return tensor
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
index a0912b625f..6e4d050073 100644
--- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -42,7 +42,8 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
       model_fn, dataset, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      iterator = distribution.distribute_dataset(dataset)
+      iterator = distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
 
       def run_step():
         return control_flow_ops.group(distribution.unwrap(
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
index dfcbb8568f..7b3670b45a 100644
--- a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
@@ -26,6 +26,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest as data_nest
 from tensorflow.python.data.util import sparse
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
@@ -34,26 +35,55 @@ from tensorflow.python.util import nest
 
 # pylint: disable=protected-access
 class _PrefetchToDeviceIterator(object):
-  """A replacement for @{tf.data.Iterator} that prefetches to another device."""
+  """A replacement for @{tf.data.Iterator} that prefetches to another device.
 
-  def __init__(self, input_dataset, devices, buffer_size):
+  Args:
+    input_dataset: The input dataset.
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    devices: Devices on which to prefetch.
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server). Only used if one_shot
+        is False.
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               one_shot,
+               devices,
+               buffer_size,
+               shared_name=None):
     self._input_dataset = input_dataset
     self._get_next_call_count = 0
+    self._one_shot = one_shot
+    if shared_name is None:
+      shared_name = ""
     self._devices = devices
-    input_iterator = input_dataset.make_one_shot_iterator()
-    input_iterator_handle = input_iterator.string_handle()
+
+    if self._one_shot:
+      self._input_iterator = input_dataset.make_one_shot_iterator()
+    else:
+      self._input_iterator = iterator_ops.Iterator.from_structure(
+          self._input_dataset.output_types, self._input_dataset.output_shapes,
+          shared_name, self._input_dataset.output_classes)
+    input_iterator_handle = self._input_iterator.string_handle()
 
     @function.Defun(dtypes.string)
     def _prefetch_fn(handle):
       """Prefetches one element from `input_iterator`."""
       remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, input_iterator.output_types, input_iterator.output_shapes,
-          input_iterator.output_classes)
+          handle, self._input_iterator.output_types,
+          self._input_iterator.output_shapes,
+          self._input_iterator.output_classes)
       ret = remote_iterator.get_next()
       return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
     target_device = gen_dataset_ops.iterator_get_device(
-        input_iterator._iterator_resource)
+        self._input_iterator._iterator_resource)
     self._buffering_resources = []
     for device in nest.flatten(self._devices):
       with ops.device(device):
@@ -61,9 +91,19 @@ class _PrefetchToDeviceIterator(object):
             f=_prefetch_fn,
             target_device=target_device,
             string_arg=input_iterator_handle,
-            buffer_size=buffer_size)
+            buffer_size=buffer_size,
+            shared_name=shared_name)
         self._buffering_resources.append(buffer_resource_handle)
 
+    if not self._one_shot:
+      reset_ops = []
+      for buffer_resource in self._buffering_resources:
+        reset_ops.append(
+            prefetching_ops.function_buffering_resource_reset(buffer_resource))
+      with ops.control_dependencies(reset_ops):
+        self._initializer = self._input_iterator.make_initializer(
+            self._input_dataset)
+
   def get_next(self, name=None):
     """See @{tf.data.Iterator.get_next}."""
     self._get_next_call_count += 1
@@ -92,6 +132,12 @@ class _PrefetchToDeviceIterator(object):
 
     return nest.pack_sequence_as(self._devices, flat_result)
 
+  @property
+  def initializer(self):
+    if self._one_shot:
+      raise NotImplementedError("Can't initialize a one_shot_iterator")
+    return self._initializer
+
   @property
   def output_classes(self):
     return self._input_dataset.output_classes
@@ -115,13 +161,24 @@ class _PrefetchToDeviceDataset(dataset_ops.Dataset):
     self._buffer_size = buffer_size if buffer_size is not None else 1
 
   def make_one_shot_iterator(self):
-    return _PrefetchToDeviceIterator(self._input_dataset, self._devices,
-                                     self._buffer_size)
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=True,
+        devices=self._devices,
+        buffer_size=self._buffer_size)
 
   def make_initializable_iterator(self, shared_name=None):
-    raise NotImplementedError("`prefetch_to_devices()` is not currently "
-                              "compatible with initializable iterators. Use "
-                              "`make_one_shot_iterator()` instead.")
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "make_initializable_iterator is not supported when eager "
+          "execution is enabled.")
+
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=False,
+        devices=self._devices,
+        buffer_size=self._buffer_size,
+        shared_name=shared_name)
 
   def _as_variant_tensor(self):
     # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
index 8ed16f4607..a68dbce6c7 100644
--- a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
@@ -64,5 +64,27 @@ class PrefetchingOpsV2Test(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  def testPrefetchToTwoDevicesWithReinit(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices(["/cpu:0", "/gpu:0"]))
+
+    iterator = device_dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for _ in range(5):
+        sess.run(next_element)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      sess.run(iterator.initializer)
+      for _ in range(5):
+        sess.run(next_element)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index 82514c64be..68b8f4d626 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -54,7 +54,8 @@ class StandardInputStep(Step):
 
   def __init__(self, input_dataset, distribution):
     Step.__init__(self, distribution)
-    self._distributed_input = distribution.distribute_dataset(input_dataset)
+    self._distributed_input = distribution.distribute_dataset(
+        input_dataset).make_one_shot_iterator()
 
   def inputs(self):
     return self._distributed_input.get_next()
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 87bf059038..18fedd2775 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -28,7 +28,6 @@ import six
 
 from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.distribute.python import prefetching_ops_v2
-from tensorflow.contrib.eager.python import datasets
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -510,6 +509,10 @@ class PerDeviceDataIterator(object):
     self._devices = devices
     self._prefetch_on_device = prefetch_on_device
 
+  @property
+  def initializer(self):
+    return self._iterator.initializer
+
   def get_next(self, name=None):
     """Scatter the input across devices."""
     if self._prefetch_on_device:
@@ -545,7 +548,8 @@ class PerDeviceDataset(object):
         "Prefetching is only supported in graph mode currently")
 
     if self._prefetch_on_device:
-      self._dataset = dataset
+      self._dataset = dataset.apply(
+          prefetching_ops_v2.prefetch_to_devices(self._devices))
     else:
       # TODO(priyag): If dropping remainder is not appropriate, find another
       # approach to distributing the dataset when not possible to divide evenly.
@@ -555,15 +559,13 @@ class PerDeviceDataset(object):
 
   def make_one_shot_iterator(self):
     """Get a one time use iterator for the distributed PerDeviceDataset."""
-    if self._prefetch_on_device:
-      on_device_dataset = self._dataset.apply(
-          prefetching_ops_v2.prefetch_to_devices(self._devices))
-      dataset_iterator = on_device_dataset.make_one_shot_iterator()
-    elif context.executing_eagerly():
-      dataset_iterator = datasets.Iterator(self._dataset)
-    else:
-      dataset_iterator = self._dataset.make_one_shot_iterator()
+    dataset_iterator = self._dataset.make_one_shot_iterator()
+    return PerDeviceDataIterator(
+        dataset_iterator, self._devices, self._prefetch_on_device)
 
+  def make_initializable_iterator(self):
+    """Get an initializable iterator for the distributed PerDeviceDataset."""
+    dataset_iterator = self._dataset.make_initializable_iterator()
     return PerDeviceDataIterator(
         dataset_iterator, self._devices, self._prefetch_on_device)
 
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 5c0d4b7d6c..e96ce54741 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import device_util
 from tensorflow.python.training import saver as saver_lib
@@ -408,6 +409,32 @@ class PerDeviceDatasetTest(test.TestCase):
     expected_values = [[i, i+1] for i in range(0, 10, 2)]
     self._test_iterator(devices, dataset, expected_values)
 
+  def testInitializableIterator(self):
+    with context.graph_mode():
+      devices = ["/device:CPU:0"]
+      # Using random input since that is only allowed with initializable
+      # iterator.
+      dataset = dataset_ops.Dataset.from_tensor_slices(
+          random_ops.random_uniform((10,)))
+
+      per_device_dataset = values.PerDeviceDataset(
+          dataset, devices, prefetch_on_device=False)
+      iterator = per_device_dataset.make_initializable_iterator()
+
+      self.evaluate(iterator.initializer)
+      next_element = iterator.get_next()
+      for _ in range(10):
+        self.evaluate(next_element)
+
+      # Should fail after the input is finished.
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element)
+
+      # After re-initializing the iterator, should be able to iterate again.
+      self.evaluate(iterator.initializer)
+      for _ in range(10):
+        self.evaluate(next_element)
+
 
 @test_util.with_c_api
 class MirroredVariableTest(test.TestCase):
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 4d3eff71ad..dde463aaf4 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -700,15 +700,10 @@ class Estimator(object):
     input_hooks = []
     if isinstance(result, dataset_ops.Dataset):
       if self._distribution is not None and mode == model_fn_lib.ModeKeys.TRAIN:
-        # TODO(josh11b): This is currently using a one-shot iterator, we
-        # will update this to an initializeable iterator once the
-        # necessory support for creating an initializable iterator is
-        # available.
-        result = self._distribution.distribute_dataset(result).get_next()
-      else:
-        iterator = result.make_initializable_iterator()
-        input_hooks.append(_DatasetInitializerHook(iterator))
-        result = iterator.get_next()
+        result = self._distribution.distribute_dataset(result)
+      iterator = result.make_initializable_iterator()
+      input_hooks.append(_DatasetInitializerHook(iterator))
+      result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       if len(result) != 2:
         raise ValueError(
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index c6b2dcdf98..d855c4f551 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -391,7 +391,8 @@ class DistributionStrategy(object):
 
     ```
     with my_distribution.scope():
-      iterator = my_distribution.distribute_dataset(dataset)
+      iterator = my_distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
       tower_train_ops = my_distribution.call_for_each_tower(
           tower_fn, iterator.get_next())
       train_op = tf.group(my_distribution.unwrap(tower_train_ops))
@@ -404,8 +405,14 @@ class DistributionStrategy(object):
     `tower_fn` can use the `get_tower_context()` API to get enhanced
     behavior in this case.
 
-    Note that in the future we will add support for initializable
-    Dataset iterators, at which point this example code will change.
+    You can also create an initializable iterator instead of one shot iterator.
+    In that case, you will need to ensure that you initialize the iterator
+    before calling get_next.
+    ```
+    iterator = my_distribution.distribute_dataset(
+        dataset).make_initializable_iterator()
+    session.run(iterator.initializer)
+    ```
 
   * If you want to write a distributed algorithm, you may use any of
     the `DistributionStrategy` APIs inside a
@@ -486,8 +493,8 @@ class DistributionStrategy(object):
     a variable (which by definition will have locality V(`v`), though
     will match another locality if inside a `colocate_vars_with`
     scope).
-  * `d.distribute_dataset(dataset)`: in cross-tower context, produces an
-    iterator with locality T
+  * `d.distribute_dataset(dataset).make_one_shot_iterator()`: in cross-tower
+    context, produces an iterator with locality T
   * `d.broadcast(t)`: in cross-tower context, produces a value with locality M
   * `d.broadcast(t, v)`: in cross-tower context, produces a value with
     locality V(`v`)
@@ -510,7 +517,7 @@ class DistributionStrategy(object):
 
   The standard pattern for updating variables is to:
 
-  1. Wrap your input dataset in `d.distribute_dataset()`.
+  1. Wrap your input dataset in `d.distribute_dataset()` and create an iterator.
   2. Define each tower `d.call_for_each_tower()` up to the point of
      getting a list of gradient, variable pairs.
   3. Call `d.reduce("sum", t, v)` or `d.batch_reduce()` to sum the
@@ -665,16 +672,19 @@ class DistributionStrategy(object):
     _require_distribution_strategy_scope(self)
     return variable_scope.variable_creator_scope(create_colocated_variable)
 
-  # TODO(josh11b): Currently this returns an iterator, but should return
-  # something implementing (a subset of) the Dataset API.
+  # TODO(josh11b): `PerDeviceDataset` currently only implements a few methods of
+  # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
+  # Extend to implement more functionality of datasets.
   def distribute_dataset(self, dataset):
-    """Return an iterator into `dataset` split across all towers.
+    """Return a `dataset` split across all towers.
 
-    Suitable for providing input to for `call_for_each_tower()`, as in:
+    Suitable for providing input to `call_for_each_tower()` by creating an
+    iterator:
 
     ```
     with distribution_strategy.scope():
-      iterator = distribution_strategy.distribute_dataset(dataset)
+      distributed_dataset = distribution_strategy.distribute_dataset(dataset)
+      iterator = distributed_dataset.make_one_shot_iterator()
       tower_results = distribution_strategy.call_for_each_tower(
           tower_fn, iterator.get_next())
     ```
@@ -683,7 +693,7 @@ class DistributionStrategy(object):
       dataset: A `tf.data.Dataset`.
 
     Returns:
-      A Dataset iterator that will produce separate splits for each tower.
+      A `PerDeviceDataset` that will produce data for each tower.
     """
     raise NotImplementedError("must be implemented in descendants")
 
@@ -1126,9 +1136,7 @@ class _DefaultDistributionStrategy(DistributionStrategy):
     return ops.colocate_with(colocate_with_variable)
 
   def distribute_dataset(self, dataset):
-    # TODO(josh11b): Support for this when executing eagerly is currently only
-    # in contrib.
-    return dataset.make_one_shot_iterator()
+    return dataset
 
   def _broadcast(self, tensor, destinations):
     if destinations is None:
-- 
GitLab


From f0aabfa0139cb83c857e6142286d025515fbf9a1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 12:10:51 -0700
Subject: [PATCH 080/450] Make toco generate uint8 weights that are safe for
 fast int8 kernels.

PiperOrigin-RevId: 193395910
---
 tensorflow/contrib/lite/toco/BUILD            |   1 +
 tensorflow/contrib/lite/toco/args.h           |   1 +
 ...int8_weights_safe_for_fast_int8_kernels.cc | 209 ++++++++++++++++++
 .../graph_transformations.h                   |  13 ++
 .../contrib/lite/toco/toco_cmdline_flags.cc   |   9 +
 tensorflow/contrib/lite/toco/toco_flags.proto |   7 +-
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   5 +
 7 files changed, 244 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index f696f4b845..3f73ef620e 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -219,6 +219,7 @@ cc_library(
         "graph_transformations/drop_fake_quant.cc",
         "graph_transformations/drop_im2col_arrays.cc",
         "graph_transformations/ensure_bias_vectors.cc",
+        "graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc",
         "graph_transformations/experimental_shuffle_fc_weights.cc",
         "graph_transformations/fuse_activation_functions.cc",
         "graph_transformations/fuse_binary_into_following_affine.cc",
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index c9662d05ce..fe30b88344 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -240,6 +240,7 @@ struct ParsedTocoFlags {
   Arg<bool> debug_disable_recurrent_cell_fusion = Arg<bool>(false);
   Arg<bool> drop_control_dependency = Arg<bool>(false);
   Arg<bool> propagate_fake_quant_num_bits = Arg<bool>(false);
+  Arg<bool> allow_nudging_weights_to_use_fast_gemm_kernel = Arg<bool>(false);
 };
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
new file mode 100644
index 0000000000..394fa349e2
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
@@ -0,0 +1,209 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// === Summary ===
+//
+// TLDR: Some of our 8-bit arithmetic operations require uint8 weight values
+// to avoid the value 0, thus ranging only in [1, 255]. This enables faster
+// runtime arithmetic kernels on ARM NEON. This is not relevant on most
+// other hardware architectures, and will cease to be relevant on ARM NEON
+// in the future. These topics are elaborated below ("Context").
+//
+// Having just one isolated uint8 value equal to 0 is fine. The bad case is when
+// two uint8 values are both zero and are less than 16 bytes apart.
+//
+// By default, toco generates a fatal error when that happens. The user may opt
+// in to more lax behavior by passing
+//   --allow_nudging_weights_to_use_fast_gemm_kernel.
+// This causes toco to nudge such bad 0 values into the value 1, thus avoiding
+// the problem in exchange for compromising on accuracy.
+//
+// The present graph transformation implements both the default fatal-erroring
+// behavior, and, when allow_nudging_weights is set, also the lax nudging
+// behavior.
+//
+//
+// === Context ===
+//
+// Since March 2017, we have been using a trick to perform faster
+// 8bit matrix multiplications, to our knowledge first implemented in gemmlowp
+// here:
+//   https://github.com/google/gemmlowp/commit/25b2989415b99e797e1ab977837111b2e231f81f
+//
+// This trick is explained in Appendix B of our paper,
+//   https://arxiv.org/abs/1712.05877
+//
+// Here is the relevant paragraph:
+//
+//      For efficient NEON implementation of the matrix multiplication’s
+//      core accumulation, we use the following trick.
+//      In the multiply-add operation in (10), we first change the
+//      operands’ type from uint8 to int8 (which can be done by
+//      subtracting 128 from the quantized values and zero-points).
+//      Thus the core multiply-add becomes
+//
+//            int32 += int8 * int8. (B.1)
+//
+//      As mentioned in section 3, with a minor tweak of the quantized
+//      training process, we can ensure that the weights, once
+//      quantized as int8 values, never take the value −128. Hence,
+//      the product in (B.1) is never −128 ∗ −128, and is therefore
+//      always less than 2^14 in absolute value. Hence, (B.1)
+//      can accumulate two products on a local int16 accumulator
+//      before that needs to be accumulated into the true int32 accumulator.
+//      This allows the use of an 8-way SIMD multiplication
+//      (SMULL on int8 operands), followed by an 8-way
+//      SIMD multiply-add (SMLAL on int8 operands), followed
+//      by a pairwise-add-and-accumulate into the int32 accumulators
+//      (SADALP).
+//
+// As that paragraph notes, quantized training should be suitably modified to
+// ensure that quantized uint8 weight values only range in [1, 255]. So the
+// problem that we are dealing with is only about the existing 8-bit quantized
+// models that haven't been trained specifically to get 8-bit weights only in
+// [1, 255].
+//
+// This spreadsheet shows the speed benefit of this trick across many existing
+// ARM-architecture CPUs:
+//
+//    https://docs.google.com/spreadsheets/d/1-0LjdMvW0XtH1bYknC0bQINoFaxjTuL9eplZZcitykI/edit?usp=sharing
+//
+// Compare Row 18 (fast int8 trick) to Row 20 (regular uint8 kernel).
+//
+// The introduction of the 'dotprod' extension to ARM NEON, specifically the
+// SDOT instruction, renders this eventually moot. See the experimental
+// kernels contributed by ARM here,
+//
+//     https://github.com/google/gemmlowp/pull/116
+//
+// However, as of April 2018, there don't seem to be any commercially available
+// CPUs supporting these instructions (yet); we are waiting for
+// Cortex-A{75,55}-r1 to become available; the "-r1" is key here. Even if such
+// CPUs become available soon, it will presumably take years for them to
+// overtake the large volume of existing CPUs not supporting these new
+// instructions, especially in current and future low-end devices. All in all,
+// we can foresee these 'fast int8 kernels' to remain important to have into
+// the 2020s.
+//
+bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model,
+                                                   std::size_t op_index) {
+  const auto& op = *model->operators[op_index];
+  int weights_index = 0;
+  switch (op.type) {
+    case OperatorType::kConv:
+      weights_index = 1;
+      break;
+    case OperatorType::kLstmCell:
+      weights_index = 2;
+      break;
+    case OperatorType::kFullyConnected: {
+      weights_index = 1;
+      const auto& fc_op = static_cast<const toco::FullyConnectedOperator&>(op);
+      CHECK(!fc_op.experimental_shuffled_weights)
+          << "This graph transformation expects to run before FC weights get "
+             "shuffled.";
+      break;
+    }
+    default:
+      // Other operator types are unaffected by this graph transformation,
+      // because their runtime implementations don't use the fast int8 trick.
+      // In particular that's the case of DepthwiseConv at the moment.
+      // We have to update this logic when that changes, e.g. if in the future
+      // some DepthwiseConv kernel wants to use the trick.
+      //
+      // The reason why that's not so likely, hence why it's fairly safe to
+      // stay conservative in the list of operators that we handle here, is that
+      // the fast int8 kernel trick is only applicable to ops that either are
+      // implemented as a GEMM, or use symmetric ranges for both weights and
+      // activations. The reason why GEMM is special (can use the trick even
+      // without symmetric ranges) is that it is so arithmetic-intense that
+      // it can use techniques reducing its implementation to the symmetric
+      // ranges case, with limited relative overhead (O(N^2) overhead vs
+      // O(N^3) GEMM cost). See https://arxiv.org/pdf/1712.05877, section
+      // 2.3 Efficient handling of zero-points.
+      //
+      // That's why at the moment we only handle operators that use a GEMM
+      // (Conv, fully-connected --- note that LSTM merely wraps a
+      // fully-connected operator).
+      return false;
+  }
+
+  const string& name = op.inputs[weights_index];
+  auto& array = model->GetArray(name);
+  // Nothing to check unless the array has a buffer holding uint8 data.
+  if (!array.buffer || array.data_type != ArrayDataType::kUint8) {
+    return false;
+  }
+  auto& buffer_data = array.GetMutableBuffer<ArrayDataType::kUint8>().data;
+
+  int count_bad = 0;
+  int index_of_previous_bad_value = 0;
+  bool changed = false;
+
+  for (int i = 0, end = static_cast<int>(buffer_data.size()); i < end; i++) {
+    if (buffer_data[i] == 0) {
+      count_bad++;
+      if (count_bad > 1) {
+        const int distance = i - index_of_previous_bad_value;
+        // Semi-arbitrary threshold. The idea is that trouble only occurs
+        // when two bad values are very close to each other so that they
+        // are jointly used within registers inside some GEMM kernel.
+        // The details of that depend on the kernel. Our current fast ARM64
+        // kernel, for instance, only has an issue when the distance between
+        // consecutive bad values is exactly 8. We do not want to track such
+        // kernel details too closely here, so we pick a threshold that's
+        // a bit larger than that, to give us room to change kernels in the
+        // future without worrying.
+        static constexpr int kMinDistanceBetweenBadValues = 16;
+        if (distance < kMinDistanceBetweenBadValues) {
+          if (allow_nudging_weights()) {
+            // The nudged value is no longer bad, so skip the update of
+            // index_of_previous_bad_value at the end of the loop body.
+            buffer_data[i] = 1;
+            changed = true;
+            continue;
+          }
+          LOG(FATAL) << "Bad value for " << name << " at index " << i
+                     << ", previous bad value at index "
+                     << index_of_previous_bad_value << ", distance=" << distance
+                     << ", kMinDistanceBetweenBadValues="
+                     << kMinDistanceBetweenBadValues << ". Consider passing "
+                     << "--allow_nudging_weights_to_use_fast_gemm_kernel "
+                     << "if you don't care about accuracy.";
+        }
+      }
+      index_of_previous_bad_value = i;
+    }
+  }
+
+  if (changed) {
+    AddMessageF("Tweaked weights values for %s", LogName(op));
+  }
+
+  return changed;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 8075d0205d..72ffd51db4 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -246,6 +246,19 @@ class ResolveConstantFakeQuant : public GraphTransformation {
   bool propagate_fake_quant_num_bits_ = false;
 };
 
+class EnsureUint8WeightsSafeForFastInt8Kernels : public GraphTransformation {
+ public:
+  bool Run(Model* model, std::size_t op_index) override;  // True iff nudged.
+  const char* Name() const override {
+    return "EnsureUint8WeightsSafeForFastInt8Kernels";
+  }
+  bool allow_nudging_weights() const { return allow_nudging_weights_; }
+  void set_allow_nudging_weights(bool val) { allow_nudging_weights_ = val; }
+
+ private:
+  bool allow_nudging_weights_ = false;  // If false, bad weights are fatal.
+};
+
 #undef DECLARE_GRAPH_TRANSFORMATION
 
 }  // end namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index 74f98c8452..1611c4d0c0 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -141,6 +141,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
            parsed_flags.propagate_fake_quant_num_bits.default_value(),
            "If true, use FakeQuant* operator num_bits attributes to adjust "
            "array data_types."),
+      Flag("allow_nudging_weights_to_use_fast_gemm_kernel",
+           parsed_flags.allow_nudging_weights_to_use_fast_gemm_kernel.bind(),
+           parsed_flags.allow_nudging_weights_to_use_fast_gemm_kernel
+               .default_value(),
+           "Some fast uint8 GEMM kernels require uint8 weights to avoid the "
+           "value 0. This flag allows nudging them to 1 to allow proceeding, "
+           "with moderate inaccuracy."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -230,6 +237,8 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
   READ_TOCO_FLAG(drop_control_dependency, FlagRequirement::kNone);
   READ_TOCO_FLAG(debug_disable_recurrent_cell_fusion, FlagRequirement::kNone);
   READ_TOCO_FLAG(propagate_fake_quant_num_bits, FlagRequirement::kNone);
+  READ_TOCO_FLAG(allow_nudging_weights_to_use_fast_gemm_kernel,
+                 FlagRequirement::kNone);
 
   // Deprecated flag handling.
   if (parsed_toco_flags.input_type.specified()) {
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index 869c512d93..a04017a6bf 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -37,7 +37,7 @@ enum FileFormat {
 // of as properties of models, instead describing how models are to be
 // processed in the context of the present tooling job.
 //
-// Next ID to use: 17.
+// Next ID to use: 18.
 message TocoFlags {
   // Input file format
   optional FileFormat input_format = 1;
@@ -156,4 +156,9 @@ message TocoFlags {
   // Input and output array data types may change because of this propagation
   // and users must be sure to query the final data_type values.
   optional bool propagate_fake_quant_num_bits = 14;
+
+  // Some fast uint8 GEMM kernels require uint8 weights to avoid the value 0.
+  // This flag allows nudging them to 1 to allow proceeding, with moderate
+  // inaccuracy.
+  optional bool allow_nudging_weights_to_use_fast_gemm_kernel = 17;
 }
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 89cb2f85f8..7252ec2ea4 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -317,12 +317,17 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     }
 
     CheckIsReadyForQuantization(*model);
+    auto* ensure_safe_for_int8_kernels =
+        new EnsureUint8WeightsSafeForFastInt8Kernels;
+    ensure_safe_for_int8_kernels->set_allow_nudging_weights(
+        toco_flags.allow_nudging_weights_to_use_fast_gemm_kernel());
     RunGraphTransformations(model, "quantization graph transformations",
                             {
                                 new RemoveTrivialQuantizedActivationFunc,
                                 new RemoveTrivialQuantizedMinMax,
                                 new Quantize,
                                 new RemoveFinalDequantizeOp,
+                                ensure_safe_for_int8_kernels,
                             });
   } else {
     GraphTransformationsSet dequantization_transformations{new Dequantize};
-- 
GitLab


From 87d37a689cff06ae1c1539abb747d152170c91b1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 12:13:24 -0700
Subject: [PATCH 081/450] Automated g4 rollback of changelist 193369280

PiperOrigin-RevId: 193396206
---
 .../optimizers/constant_folding_test.cc       | 88 +------------------
 1 file changed, 1 insertion(+), 87 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 4b41dae480..36625b68b7 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -520,25 +520,6 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
       EXPECT_EQ("Mul", node.op()) << node.name();
     }
   }
-
-  const std::vector<string> fetch = {"mul_0", "mul_4", "mul_8"};
-  auto x_known_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
-  auto x_partially_unknown_t =
-      GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
-  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
-  auto expected_tensors =
-      EvaluateNodes(item.graph, fetch,
-                    {{"x_known", x_known_t},
-                     {"x_partially_unknown", x_partially_unknown_t},
-                     {"x_unknown", x_unknown_t}});
-  EXPECT_EQ(fetch.size(), expected_tensors.size());
-  auto tensors = EvaluateNodes(output, fetch,
-                               {{"x_known", x_known_t},
-                                {"x_partially_unknown", x_partially_unknown_t},
-                                {"x_unknown", x_unknown_t}});
-  EXPECT_EQ(fetch.size(), tensors.size());
-  for (int i = 0; i < tensors.size(); i++)
-    test::ExpectTensorNear<float>(expected_tensors[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
@@ -591,20 +572,6 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
       EXPECT_TRUE(IsControlInput(node.input(1)));
     }
   }
-  const std::vector<string> fetch = {"addn1"};
-  auto x_partially_unknown_t =
-      GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
-  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
-  auto expected_tensors =
-      EvaluateNodes(item.graph, fetch,
-                    {{"x_partially_unknown", x_partially_unknown_t},
-                     {"x_unknown", x_unknown_t}});
-  EXPECT_EQ(1, expected_tensors.size());
-  auto tensors = EvaluateNodes(output, fetch,
-                               {{"x_partially_unknown", x_partially_unknown_t},
-                                {"x_unknown", x_unknown_t}});
-  EXPECT_EQ(1, tensors.size());
-  test::ExpectTensorNear<float>(expected_tensors[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, CreateConstNodes) {
@@ -1089,20 +1056,6 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
     }
   }
   EXPECT_EQ(9, found);
-
-  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
-  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 6}));
-  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
-  const std::vector<string> fetch_nodes = {"i1a", "i1b", "i2a", "i2b",
-                                           "i2c", "i3a", "i3b"};
-  auto tensors_expected = EvaluateNodes(
-      item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
-  auto tensors = EvaluateNodes(output, fetch_nodes,
-                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors.size());
-  for (int i = 0; i < fetch_nodes.size(); i++)
-    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
@@ -1935,14 +1888,6 @@ TEST_F(ConstantFoldingTest, Packing) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  const std::vector<string> fetch_nodes = {"i1", "i2"};
-  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes);
-  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
-  auto tensors = EvaluateNodes(output, fetch_nodes);
-  EXPECT_EQ(fetch_nodes.size(), tensors.size());
-  for (int i = 0; i < fetch_nodes.size(); i++)
-    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
-
   // Make sure that the representation of the folded constant is space
   // efficient: in particular, the whole message should be smaller than 8k
   // (the size needed to naively encode 1000 floats folded twice).
@@ -1978,13 +1923,6 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  std::vector<string> fetch_nodes = {"o1", "o2", "p1", "p2"};
-  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 5}));
-  auto g_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1}));
-  auto tensors_expected =
-      EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}, {"g", g_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
-
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
   status = optimizer.Optimize(nullptr, item, &output);
@@ -2025,11 +1963,6 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
     }
   }
   EXPECT_EQ(6, found);
-
-  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}, {"g", g_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors.size());
-  for (int i = 0; i < fetch_nodes.size(); i++)
-    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
@@ -2049,11 +1982,6 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  std::vector<string> fetch_nodes = {"o1", "o2"};
-  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
-  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
-
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -2108,10 +2036,6 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
     }
   }
   EXPECT_EQ(7, found);
-  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors.size());
-  for (int i = 0; i < fetch_nodes.size(); i++)
-    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
@@ -2589,7 +2513,7 @@ TEST_F(ConstantFoldingTest, Enter) {
   value_tensor.AsProtoTensorContent(value.mutable_tensor());
 
   GraphDef& graph = item.graph;
-  AddNode("x", "Placeholder", {}, {{"dtype", type}}, &graph);
+  AddNode("x", "Placeholder", {}, {{"T", type}}, &graph);
   AddNode("c1", "Const", {"^x"}, {{"value", value}, {"dtype", type}}, &graph);
   AddNode("enter1", "Enter", {"x"},
           {{"T", type},
@@ -2615,10 +2539,6 @@ TEST_F(ConstantFoldingTest, Enter) {
   item.fetch.push_back("id3");
   item.fetch.push_back("id4");
 
-  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
-  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
-  EXPECT_EQ(item.fetch.size(), tensors_expected.size());
-
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -2646,12 +2566,6 @@ TEST_F(ConstantFoldingTest, Enter) {
       EXPECT_EQ("enter3", node.input(0));
     }
   }
-
-  auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
-  EXPECT_EQ(item.fetch.size(), tensors.size());
-
-  for (int i = 0; i < item.fetch.size(); i++)
-    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, TensorArraySize) {
-- 
GitLab


From fe732eea0138167f105720ce83cc0e3034a19d07 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Wed, 18 Apr 2018 12:26:03 -0700
Subject: [PATCH 082/450] Minor test improvement

PiperOrigin-RevId: 193398068
---
 tensorflow/contrib/data/python/kernel_tests/bucketing_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
index 6002cc73c8..55a56b83a8 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -61,7 +61,7 @@ class GroupByWindowTest(test.TestCase):
 
       self.assertEqual(len(components), sum(counts))
       num_full_batches = len([c for c in counts if c == 4])
-      self.assertGreaterEqual(num_full_batches, 23)
+      self.assertGreaterEqual(num_full_batches, 24)
       self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
 
   def testImmediateOutput(self):
-- 
GitLab


From 011740b18b8309bb3126f95b736931d850a83861 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 12:32:53 -0700
Subject: [PATCH 083/450] Create specialized functions in optimized graph for
 each function instantiation context.

PiperOrigin-RevId: 193399263
---
 .../common_runtime/graph_execution_state.cc   |  44 +--
 .../common_runtime/graph_execution_state.h    |   5 +-
 tensorflow/core/grappler/optimizers/BUILD     |   2 +
 .../grappler/optimizers/function_optimizer.cc | 186 +++++++++---
 .../grappler/optimizers/function_optimizer.h  |   9 +
 .../optimizers/function_optimizer_test.cc     | 269 ++++++++++--------
 tensorflow/core/grappler/utils/functions.cc   |  94 ++++--
 tensorflow/core/grappler/utils/functions.h    |  19 ++
 .../core/grappler/utils/functions_test.cc     |  39 ++-
 9 files changed, 468 insertions(+), 199 deletions(-)

diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 6a3e6906a3..642d91e328 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -398,7 +398,8 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
 }
 
 Status GraphExecutionState::OptimizeGraph(
-    const BuildGraphOptions& options, std::unique_ptr<Graph>* optimized_graph) {
+    const BuildGraphOptions& options, std::unique_ptr<Graph>* optimized_graph,
+    std::unique_ptr<FunctionLibraryDefinition>* optimized_flib) {
 #ifndef IS_MOBILE_PLATFORM
   if (session_options_->config.graph_options().place_pruned_graph()) {
     return errors::InvalidArgument("Can't optimize a pruned graph");
@@ -493,9 +494,17 @@ Status GraphExecutionState::OptimizeGraph(
     GraphDef new_graph;
     TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer(
         item, rewrite_options, cpu_device, &cluster, &new_graph));
+
+    // Merge optimized graph function library with an original library.
+    // Optimized graph might have new functions specialized for its
+    // instantiation context (see Grappler function optimizer).
+    optimized_graph->reset(new Graph(OpRegistry::Global()));
+    optimized_flib->reset(new FunctionLibraryDefinition(OpRegistry::Global(),
+                                                        new_graph.library()));
+    TF_RETURN_IF_ERROR((*optimized_flib)->AddLibrary(*flib_def_));
+
     GraphConstructorOptions opts;
     opts.allow_internal_ops = true;
-    optimized_graph->reset(new Graph(OpRegistry::Global()));
     TF_RETURN_IF_ERROR(
         ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get()));
     // The graph conversion sets the requested device names but not the assigned
@@ -524,18 +533,25 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
         "Attempted to prune a graph that has not been fully initialized.");
   }
 
-  std::unique_ptr<Graph> ng;
-  Status s = OptimizeGraph(options, &ng);
+  // Grappler optimization might change the structure of a graph itself, and
+  // also it can add/prune functions to/from the library.
+  std::unique_ptr<Graph> optimized_graph;
+  std::unique_ptr<FunctionLibraryDefinition> optimized_flib;
+
+  Status s = OptimizeGraph(options, &optimized_graph, &optimized_flib);
   if (!s.ok()) {
-    // Simply copy the original graph if we couldn't optimize it.
-    ng.reset(new Graph(flib_def_.get()));
-    CopyGraph(*graph_, ng.get());
+    // Simply copy the original graph and the function library if we couldn't
+    // optimize it.
+    optimized_graph.reset(new Graph(flib_def_.get()));
+    CopyGraph(*graph_, optimized_graph.get());
+    optimized_flib.reset(new FunctionLibraryDefinition(*flib_def_));
   }
 
   subgraph::RewriteGraphMetadata rewrite_metadata;
   if (session_options_ == nullptr ||
       !session_options_->config.graph_options().place_pruned_graph()) {
-    TF_RETURN_IF_ERROR(PruneGraph(options, ng.get(), &rewrite_metadata));
+    TF_RETURN_IF_ERROR(
+        PruneGraph(options, optimized_graph.get(), &rewrite_metadata));
   } else {
     // This GraphExecutionState represents a graph that was
     // pruned when this was constructed, so we copy the metadata from
@@ -549,15 +565,11 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
   CHECK_EQ(options.callable_options.fetch_size(),
            rewrite_metadata.fetch_types.size());
 
-  // Make a fresh copy of the function library for the client graph.
-  std::unique_ptr<FunctionLibraryDefinition> flib(
-      new FunctionLibraryDefinition(*flib_def_));
-
   // TODO(andydavis): Clarify optimization pass requirements around CostModel.
   GraphOptimizationPassOptions optimization_options;
   optimization_options.session_options = session_options_;
-  optimization_options.graph = &ng;
-  optimization_options.flib_def = flib.get();
+  optimization_options.graph = &optimized_graph;
+  optimization_options.flib_def = optimized_flib.get();
   optimization_options.device_set = device_set_;
 
   TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
@@ -567,9 +579,9 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
   // since the local CostModel used to record its stats is sized by
   // the largest node id.
   std::unique_ptr<ClientGraph> dense_copy(
-      new ClientGraph(std::move(flib), rewrite_metadata.feed_types,
+      new ClientGraph(std::move(optimized_flib), rewrite_metadata.feed_types,
                       rewrite_metadata.fetch_types));
-  CopyGraph(*ng, &dense_copy->graph);
+  CopyGraph(*optimized_graph, &dense_copy->graph);
 
   // TODO(vrv): We should check invariants of the graph here.
 
diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h
index 2154ef5bd3..d44a24c87b 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.h
+++ b/tensorflow/core/common_runtime/graph_execution_state.h
@@ -182,8 +182,9 @@ class GraphExecutionState {
   Status PruneGraph(const BuildGraphOptions& options, Graph* graph,
                     subgraph::RewriteGraphMetadata* out_rewrite_metadata);
 
-  Status OptimizeGraph(const BuildGraphOptions& options,
-                       std::unique_ptr<Graph>* optimized_graph);
+  Status OptimizeGraph(
+      const BuildGraphOptions& options, std::unique_ptr<Graph>* optimized_graph,
+      std::unique_ptr<FunctionLibraryDefinition>* optimized_flib);
 
   GraphDef original_graph_def_;            // Immutable after ctor.
   const DeviceSet* device_set_;            // Not owned
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 3070eb1799..63492e1a7f 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -143,6 +143,8 @@ cc_library(
         ":graph_optimizer",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 6d67ead355..d008a9719f 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -29,65 +29,141 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-class FunctionInliningContext {
+// Attribute names: functions created by specialization / marked noinline.
+constexpr char kGrapplerSpecializedFuncAttr[] = "_GrapplerSpecializedFunc";
+constexpr char kNoInlineAttr[] = "_noinline";
+
+bool AttrIsTrue(const FunctionDef& func, const string& attr) {
+  const auto it = func.attr().find(attr);  // single lookup
+  return it != func.attr().end() && it->second.b();  // missing => false
+}
+
+bool MarkedSpecialized(const FunctionDef& func) {
+  return AttrIsTrue(func, kGrapplerSpecializedFuncAttr);
+}
+
+bool MarkedNoInline(const FunctionDef& func) {
+  return AttrIsTrue(func, kNoInlineAttr);
+}
+
+// Picks a library-unique name for `func` specialized for node `func_node`.
+// The base name can already be taken when nodes with the same name (e.g. in
+// the bodies of two different functions) get specialized, so a numeric suffix
+// is appended until flib.Find() reports no existing function.
+string UniqueSpecializedFunctionName(const FunctionDef& func,
+                                     const NodeDef& func_node,
+                                     const FunctionLibraryDefinition& flib) {
+  // '/' in the node name is replaced by '_' in the function name.
+  const string node_part = str_util::StringReplace(func_node.name(), "/", "_",
+                                                   /*replace_all*/ true);
+  const string base_name = strings::StrCat(func.signature().name(),
+                                           "_specialized_for_", node_part);
+
+  // Try "<base>", then "<base>_1", "<base>_2", ... until the name is free.
+  string candidate = base_name;
+  for (int suffix = 1; flib.Find(candidate) != nullptr; ++suffix) {
+    candidate = strings::StrCat(base_name, "_", suffix);
+  }
+  return candidate;
+}
+
+class FunctionOptimizerContext {
  public:
-  explicit FunctionInliningContext(const GrapplerItem& item,
-                                   RewriterConfig::Toggle opt_level)
+  explicit FunctionOptimizerContext(const GrapplerItem& item,
+                                    RewriterConfig::Toggle opt_level)
       : opt_level_(opt_level),
-        functions_(InliningCandidates(item)),
         function_library_(FunctionLibraryDefinition(OpRegistry::Global(),
-                                                    item.graph.library())) {}
+                                                    item.graph.library())) {
+    InitializeInlinedFunctions(item);
+  }
+
+  const FunctionLibraryDefinition& function_library() const {
+    return function_library_;
+  }
 
-  const FunctionLibraryDefinition& FunctionLibrary() const {
+  FunctionLibraryDefinition& mutable_function_library() {
     return function_library_;
   }
 
-  bool HasInlinedFunctions() const { return !functions_.empty(); }
+  bool IsInlinedFunction(const string& name) const {
+    return inlined_functions_.count(name) > 0;
+  }
 
   // Find inlining candidate by name. Return nullptr if not found.
   const FunctionDef* FindInlinedFunction(const string& name) const {
-    auto it = functions_.find(name);
-    if (it != functions_.end()) {
-      return it->second;
-    } else {
-      return nullptr;
-    }
+    return gtl::FindWithDefault(inlined_functions_, name, nullptr);
   }
 
  private:
-  std::unordered_map<string, const FunctionDef*> InliningCandidates(
-      const GrapplerItem& item) const {
-    std::unordered_map<string, const FunctionDef*> functions;
+  void InitializeInlinedFunctions(const GrapplerItem& item) {
+    bool aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
+
     for (const FunctionDef& func : item.graph.library().function()) {
-      // Don't inline functions marked as noinline
-      if (func.attr().count("_noinline") != 0 &&
-          func.attr().at("_noinline").b() &&
-          opt_level_ != RewriterConfig::AGGRESSIVE) {
-        continue;
-      }
       // Can't create IdentityN nodes with no input or output: skip these
       // functions for now.
       if (func.signature().input_arg_size() == 0 ||
           func.signature().output_arg_size() == 0) {
         continue;
       }
-      functions[func.signature().name()] = &func;
+      bool marked_noinline = MarkedNoInline(func);
+      bool marked_specialized = MarkedSpecialized(func);
+
+      if (!marked_specialized && (!marked_noinline || aggressive)) {
+        inlined_functions_[func.signature().name()] = &func;
+      }
     }
-    return functions;
   }
 
   RewriterConfig::Toggle opt_level_;
-  std::unordered_map<string, const FunctionDef*> functions_;
   FunctionLibraryDefinition function_library_;
+  // Functions that can be inlined into optimized graph.
+  std::unordered_map<string, const FunctionDef*> inlined_functions_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext);
+  TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext);
 };
 
+// Creates a specialized copy of `func` for the call site `func_node`: the copy
+// is registered in the context function library under a unique name, and a
+// clone of `func_node` that calls it is appended to `optimized_graph`.
+Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
+                          FunctionOptimizerContext* ctx,
+                          GraphDef* optimized_graph) {
+  const FunctionLibraryDefinition& library = ctx->function_library();
+  const std::unordered_map<string, AttrValue> call_attrs(
+      func_node.attr().begin(), func_node.attr().end());
+
+  // Build a GrapplerFunctionItem and convert it right back to a FunctionDef.
+  // TODO(ezhulenev): Push down const inputs and known input shapes.
+  GrapplerFunctionItem func_item;
+  TF_RETURN_IF_ERROR(
+      MakeGrapplerFunctionItem(func, call_attrs, library, &func_item));
+  FunctionDef specialized_def;
+  TF_RETURN_IF_ERROR(
+      MakeSpecializedFunctionDef(func_item, library, &specialized_def));
+
+  // Give the copy a fresh name and tag it as a product of specialization.
+  const string new_name =
+      UniqueSpecializedFunctionName(func, func_node, library);
+  specialized_def.mutable_signature()->set_name(new_name);
+  (*specialized_def.mutable_attr())[kGrapplerSpecializedFuncAttr].set_b(true);
+
+  // Register the copy in the function library.
+  TF_RETURN_IF_ERROR(
+      ctx->mutable_function_library().AddFunctionDef(specialized_def));
+
+  // Emit a clone of the call node that invokes the specialized function.
+  NodeDef* call_node = optimized_graph->add_node();
+  *call_node = func_node;
+  call_node->set_op(new_name);
+  return Status::OK();
+}
+
 // Copy input/output argument type to the type_list. Return error if argument
 // type is not explicitly defined, and not specified in function attributes.
 Status CopyArgType(const NodeDef& func_node,
@@ -148,14 +224,14 @@ Status HookInlinedFunctionOutputs(
 }
 
 Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
-                      const FunctionInliningContext& ctx,
+                      const FunctionOptimizerContext& ctx,
                       GraphDef* optimized_graph) {
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
   GrapplerFunctionItem item;
   Status item_status =
-      MakeGrapplerFunctionItem(func, func_attr, ctx.FunctionLibrary(), &item);
+      MakeGrapplerFunctionItem(func, func_attr, ctx.function_library(), &item);
 
   if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
@@ -378,39 +454,61 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
 
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
-  FunctionInliningContext function_inlining_ctx(item, opt_level_);
-
   // Nothing to do here.
-  if (!function_inlining_ctx.HasInlinedFunctions()) {
+  if (item.graph.library().function_size() == 0) {
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
+  FunctionOptimizerContext ctx(item, opt_level_);
   SymbolicGradientEnv env(item.graph.versions().producer(),
                           item.graph.library());
 
+  bool inline_gradients = options_.enable_symbolic_gradient_inlining;
+  bool inline_func = options_.enable_function_inlining;
+  bool specialize_func = options_.enable_function_specialization;
+
   for (const NodeDef& node : item.graph.node()) {
-    if (node.op() == "SymbolicGradient") {
-      TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph));
-      continue;
+    const string func_name = node.op();
+
+    if (func_name == "SymbolicGradient" && inline_gradients) {
+      // Inline symbolic gradients only if the corresponding function is inlined
+      const auto* f_attr = gtl::FindOrNull(node.attr(), "f");
+      string f_name = f_attr != nullptr ? f_attr->func().name() : "";
+      if (ctx.IsInlinedFunction(f_name)) {
+        TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph));
+        continue;
+      }
     }
 
-    const FunctionDef* func =
-        function_inlining_ctx.FindInlinedFunction(node.op());
+    const FunctionDef* func = ctx.function_library().Find(func_name);
     if (func != nullptr) {
-      TF_RETURN_IF_ERROR(
-          InlineFunction(node, *func, function_inlining_ctx, optimized_graph));
-    } else {
-      *optimized_graph->add_node() = node;
+      if (inline_func && ctx.IsInlinedFunction(func_name)) {
+        // Inline function body into the optimized graph.
+        TF_RETURN_IF_ERROR(InlineFunction(node, *func, ctx, optimized_graph));
+        continue;
+      }
+
+      if (specialize_func && IsParametrized(*func)) {
+        // TODO(ezhulenev): Specialize function call if input is a Const or has
+        // a known shape. Const input tensors can be pushed into the function
+        // body and removed from function inputs.
+
+        // Specialize function body for its instantiation attributes and inputs.
+        TF_RETURN_IF_ERROR(
+            SpecializeFunction(node, *func, &ctx, optimized_graph));
+        continue;
+      }
     }
-  }
 
-  // TODO(bsteiner): specialize the implementation of functions that can't be
-  // inlined based on the context in which they're instantiated.
+    // If we reached this point, node was not handled by any of the stages
+    // (inline, specialize), simply add a copy to the graph.
+    *optimized_graph->add_node() = node;
+  }
 
   // TODO(bsteiner): trim the library to remove unused function definitions
   *optimized_graph->mutable_versions() = item.graph.versions();
-  *optimized_graph->mutable_library() = item.graph.library();
+  *optimized_graph->mutable_library() = ctx.function_library().ToProto();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index b124efe01d..c555fadf83 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -38,7 +38,16 @@ class FunctionOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
+  friend class FunctionOptimizerTest;
+
+  struct FunctionOptimizerOptions {
+    bool enable_function_inlining = true;
+    bool enable_function_specialization = true;
+    bool enable_symbolic_gradient_inlining = true;
+  };
+
   RewriterConfig::Toggle opt_level_;
+  FunctionOptimizerOptions options_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index 099fe7caf2..fb006d4868 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -24,92 +24,97 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace {
 
+namespace {
 constexpr char kDevice[] = "/device:CPU:0";
+}  // namespace
 
 class FunctionOptimizerTest : public GrapplerTest {
  protected:
-  Tensor MakeScalarTensor(float value) {
-    Tensor tensor(DT_FLOAT, {});
-    tensor.scalar<float>()() = value;
-    return tensor;
+  void DisableAll(FunctionOptimizer* optimizer) {
+    optimizer->options_.enable_function_inlining = false;
+    optimizer->options_.enable_function_specialization = false;
+    optimizer->options_.enable_symbolic_gradient_inlining = false;
   }
 
-  Tensor MakeScalarTensor(int value) {
-    Tensor tensor(DT_INT32, {});
-    tensor.scalar<int>()() = value;
-    return tensor;
+  void EnableOnlyFunctionInlining(FunctionOptimizer* optimizer) {
+    DisableAll(optimizer);
+    optimizer->options_.enable_function_inlining = true;
+  }
+
+  void EnableOnlyFunctionSpecialization(FunctionOptimizer* optimizer) {
+    DisableAll(optimizer);
+    optimizer->options_.enable_function_specialization = true;
   }
 };
 
-TEST_F(FunctionOptimizerTest, SimpleFunction) {
+TEST_F(FunctionOptimizerTest, InlineFunction_SimpleFunction) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
   // Build a graph to compute y = XTimesTwo(x)
   GrapplerItem item;
-  constexpr char device[] = "/device:CPU:0";
   item.graph = test::function::GDef(
-      {test::function::NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, device),
-       test::function::NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, device)},
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
       // FunctionLib
       {
           test::function::XTimesTwo(),
       });
 
-  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   int count = 0;
   for (const NodeDef& node : output.node()) {
     if (node.name() == "y/inlined_inputs") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("x", node.input(0));
     } else if (node.name() == "y/x") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/inlined_inputs:0", node.input(0));
     } else if (node.name() == "y/two") {
       count++;
       EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("^y/inlined_inputs", node.input(0));
     } else if (node.name() == "y/scale") {
       count++;
       EXPECT_EQ("Cast", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
     } else if (node.name() == "y/y") {
       count++;
       EXPECT_EQ("Mul", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("y/x", node.input(0));
       EXPECT_EQ("y/scale", node.input(1));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/y", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y", node.input(0));
     }
   }
   EXPECT_EQ(7, count);
 
-  Tensor pi = MakeScalarTensor(3.14f);
+  Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
@@ -118,7 +123,11 @@ TEST_F(FunctionOptimizerTest, SimpleFunction) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
+TEST_F(FunctionOptimizerTest, InlineFunction_FixedTypeFunction) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
   // Create and instantiate a version of the XTimesTwo function that only
   // accepts floats a inputs.
   const Tensor kTwo = test::AsScalar<float>(2.0f);
@@ -137,19 +146,16 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
           {{"y"}, "Mul", {"x", "two"}, {{"T", DT_FLOAT}}},
       });
 
-  constexpr char device[] = "/device:CPU:0";
   GrapplerItem item;
   item.graph = test::function::GDef(
-      {test::function::NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("y", "XTimesTwo", {"x"}, {}, device),
-       test::function::NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, device)},
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "XTimesTwo", {"x"}, {}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
       // FunctionLib
       {
           x_times_two,
       });
 
-  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -159,13 +165,13 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
     if (node.name() == "y/inlined_inputs") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("x", node.input(0));
     } else if (node.name() == "y/x") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/inlined_inputs:0", node.input(0));
     } else if (node.name() == "y/two") {
@@ -173,31 +179,31 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("^y/inlined_inputs", node.input(0));
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
     } else if (node.name() == "y/y") {
       count++;
       EXPECT_EQ("Mul", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("y/x", node.input(0));
       EXPECT_EQ("y/two", node.input(1));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/y", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y", node.input(0));
     }
   }
   EXPECT_EQ(6, count);
 
-  Tensor pi = MakeScalarTensor(3.14f);
+  Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
@@ -206,7 +212,11 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithOutputMapping) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
   FunctionDef func = FunctionDefHelper::Create(
       // Name
       "Exp_func",
@@ -223,65 +233,61 @@ TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
       {{"out", "Exp:y:0"}});
 
   GrapplerItem item;
-  constexpr char device[] = "/device:CPU:0";
   item.graph = test::function::GDef(
-      {test::function::NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("y", "Exp_func", {"x"}, {}, device),
-       test::function::NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, device)},
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "Exp_func", {"x"}, {}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
       // FunctionLib
       {
           func,
       });
 
-  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   int count = 0;
   for (const NodeDef& node : output.node()) {
     if (node.name() == "y/inlined_inputs") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("x", node.input(0));
     } else if (node.name() == "y/in") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/inlined_inputs:0", node.input(0));
     } else if (node.name() == "y/Linear_func") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/in", node.input(0));
     } else if (node.name() == "y/Exp") {
       count++;
       EXPECT_EQ("Exp", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/Linear_func", node.input(0));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/Exp", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y", node.input(0));
     }
   }
   EXPECT_EQ(6, count);
 
-  Tensor pi = MakeScalarTensor(3.14f);
+  Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
@@ -290,7 +296,11 @@ TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(FunctionOptimizerTest, FunctionWithInputForwarding) {
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithInputForwarding) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
   FunctionDef func = FunctionDefHelper::Create(
       // Name
       "ForwardInputs",
@@ -306,42 +316,30 @@ TEST_F(FunctionOptimizerTest, FunctionWithInputForwarding) {
       {{"out0", "in0"}, {"arg2", "arg2"}, {"arg3", "arg3"}});
 
   GrapplerItem item;
-  constexpr char device[] = "/device:CPU:0";
   item.graph = test::function::GDef(
-      {test::function::NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("x2", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("x3", "Placeholder", {}, {{"dtype", DT_INT32}},
-                            device),
-       test::function::NDef("x4", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("y", "ForwardInputs",
-                            {"x0", "x1", "x2", "x3", "x4"}, {}, device),
-       test::function::NDef("z0", "Identity", {"y:0"}, {{"T", DT_FLOAT}},
-                            device),
-       test::function::NDef("z1", "Identity", {"y:1"}, {{"T", DT_FLOAT}},
-                            device),
-       test::function::NDef("z2", "Identity", {"y:2"}, {{"T", DT_INT32}},
-                            device)},
+      {NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x2", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x3", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice),
+       NDef("x4", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "ForwardInputs", {"x0", "x1", "x2", "x3", "x4"}, {}, kDevice),
+       NDef("z0", "Identity", {"y:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z1", "Identity", {"y:1"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z2", "Identity", {"y:2"}, {{"T", DT_INT32}}, kDevice)},
       // FunctionLib
       {
           func,
       });
 
-  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   item.fetch = {"z0", "z1", "z2"};
-  item.feed.emplace_back("x0", MakeScalarTensor(3.14f));
-  item.feed.emplace_back("x1", MakeScalarTensor(2.7f));
-  item.feed.emplace_back("x2", MakeScalarTensor(1.0f));
-  item.feed.emplace_back("x4", MakeScalarTensor(-1.0f));
-  item.feed.emplace_back("x3", MakeScalarTensor(1234));
+  item.feed.emplace_back("x0", test::AsScalar<float>(3.14f));
+  item.feed.emplace_back("x1", test::AsScalar<float>(2.7f));
+  item.feed.emplace_back("x2", test::AsScalar<float>(1.0f));
+  item.feed.emplace_back("x4", test::AsScalar<float>(-1.0f));
+  item.feed.emplace_back("x3", test::AsScalar<int>(1234));
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
@@ -350,7 +348,12 @@ TEST_F(FunctionOptimizerTest, FunctionWithInputForwarding) {
   test::ExpectTensorEqual<int>(tensors_expected[2], tensors[2]);
 }
 
-TEST_F(FunctionOptimizerTest, FunctionWithoutInput) {
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithoutInput) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+  EnableOnlyFunctionInlining(&optimizer);
+
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
       // Name
@@ -366,25 +369,26 @@ TEST_F(FunctionOptimizerTest, FunctionWithoutInput) {
        {{"o"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}}});
 
   GrapplerItem item;
-  constexpr char device[] = "/device:CPU:0";
   item.graph = test::function::GDef(
-      {test::function::NDef("y", "GenerateTwo", {}, {}, device),
-       test::function::NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, device)},
+      {NDef("y", "GenerateTwo", {}, {}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
       // FunctionLib
       {
           func,
       });
 
-  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   // For now we won't inline the function.
   EXPECT_EQ(item.graph.DebugString(), output.DebugString());
 }
 
-TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithNestedFunctionCall) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
   // Define square via function library:
   //   MySquare(x) = MyMul(x, x)
 
@@ -402,17 +406,13 @@ TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
 
   GrapplerItem item;
   item.graph = test::function::GDef(
-      {test::function::NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            kDevice),
-       test::function::NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}},
-                            kDevice),
-       test::function::NDef("outputs", "Identity", {"square:0"},
-                            {{"T", DT_FLOAT}}, kDevice)},
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("outputs", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice)},
       // FunctionLib
       {mul_func, square_func});
 
   GraphDef output;
-  FunctionOptimizer optimizer(RewriterConfig::ON);
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   int count = 0;
@@ -469,7 +469,7 @@ TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
   EXPECT_EQ(9, count);
 
   item.fetch = {"outputs"};
-  item.feed.emplace_back("a", MakeScalarTensor(2.0f));
+  item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
   auto tensors_expected = EvaluateFetchNodes(item);
 
   GrapplerItem optimized(item, std::move(output));
@@ -478,7 +478,9 @@ TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(FunctionOptimizerTest, SymbolicGradients) {
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_TestFunc) {
+  FunctionOptimizer optimizer(RewriterConfig::ON);
+
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
   FunctionDef func = FunctionDefHelper::Define(
@@ -508,10 +510,8 @@ TEST_F(FunctionOptimizerTest, SymbolicGradients) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   std::vector<Tensor> expected =
       EvaluateNodes(item.graph, {"out1", "out2"}, {});
@@ -520,7 +520,9 @@ TEST_F(FunctionOptimizerTest, SymbolicGradients) {
   test::ExpectTensorEqual<float>(expected[1], optimized[1]);
 }
 
-TEST_F(FunctionOptimizerTest, SymbolicGradientsIdentity) {
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_IdentityFunc) {
+  FunctionOptimizer optimizer(RewriterConfig::ON);
+
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
   FunctionDef func = FunctionDefHelper::Create(
@@ -550,10 +552,8 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsIdentity) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   EXPECT_EQ(13, output.node_size());
   EXPECT_EQ("Const", output.node(0).name());
@@ -583,7 +583,9 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsIdentity) {
   test::ExpectTensorEqual<float>(expected[0], optimized[0]);
 }
 
-TEST_F(FunctionOptimizerTest, SymbolicGradientsNoInlineFunc) {
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_NoInlineFunc) {
+  FunctionOptimizer optimizer(RewriterConfig::ON);
+
   FunctionDef func = FunctionDefHelper::Define(
       "TestFunc", {"x:float", "y:float"}, {"l:float"}, {},
       {
@@ -613,7 +615,6 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsNoInlineFunc) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   // The optimizer should succeed but the graphs should be the same.
@@ -621,6 +622,52 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsNoInlineFunc) {
   CompareGraphs(item.graph, output);
 }
 
-}  // namespace
+TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+  EnableOnlyFunctionSpecialization(&optimizer);
+
+  // Mark XTimesTwo as noinline
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  (*x_times_two.mutable_attr())["_noinline"].set_b(true);
+  std::vector<FunctionDef> function_library = {x_times_two};
+
+  // Build a graph to compute y = XTimesTwo(x)
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
+      function_library);
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  // Make sure that specialized function was added to the library
+  EXPECT_EQ(2, output.library().function_size());
+  EXPECT_EQ("XTimesTwo_specialized_for_y",
+            output.library().function(1).signature().name());
+
+  // And 'y' node is calling specialized function (verified on every match)
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "y" && ++count) {
+      EXPECT_EQ("XTimesTwo_specialized_for_y", node.op());
+    }
+  }
+  EXPECT_EQ(1, count);
+
+  // And that graph evaluation yields the same result
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.fetch = {"z"};
+  item.feed.emplace_back("x", pi);
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index e8d423a759..638fe1999a 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -33,23 +33,22 @@ namespace grappler {
 
 namespace {
 
-Status OutputNameRange(const FunctionLibraryDefinition& flib,
-                       const NodeDef& node,
-                       tensorflow::NameRangeMap* outputs_range_map) {
-  const OpRegistrationData* registration;
-  TF_RETURN_IF_ERROR(flib.LookUp(node.op(), &registration));
-  TF_RETURN_IF_ERROR(tensorflow::NameRangesForNode(node, registration->op_def,
-                                                   nullptr, outputs_range_map));
+Status RegisterFunctionBodyOutputs(const OpRegistrationData& registration,
+                                   const NodeDef& node,
+                                   GrapplerFunctionConnectivity* connectivity) {
+  tensorflow::NameRangeMap outputs_range_map;
+  TF_RETURN_IF_ERROR(tensorflow::NameRangesForNode(
+      node, registration.op_def, nullptr, &outputs_range_map));
+  connectivity->RegisterFunctionBodyOutputs(node.name(), outputs_range_map);
   return Status::OK();
 }
 
 Status RegisterFunctionBodyOutputs(const FunctionLibraryDefinition& flib,
                                    const NodeDef& node,
                                    GrapplerFunctionConnectivity* connectivity) {
-  tensorflow::NameRangeMap outputs_range_map;
-  TF_RETURN_IF_ERROR(OutputNameRange(flib, node, &outputs_range_map));
-  connectivity->RegisterFunctionBodyOutputs(node.name(), outputs_range_map);
-  return Status::OK();
+  const OpRegistrationData* registration;
+  TF_RETURN_IF_ERROR(flib.LookUp(node.op(), &registration));
+  return RegisterFunctionBodyOutputs(*registration, node, connectivity);
 }
 
 // Replace the placeholder attribute values with the values specified in
@@ -306,26 +305,35 @@ GrapplerFunctionItem::GrapplerFunctionItem(
     const string& func_name, const AttrValueMap& func_attr,
     const std::vector<InputArgExpansion>& input_arg_expansions,
     const std::vector<OutputArgExpansion>& output_arg_expansions,
+    const std::vector<string>& keep_nodes, bool is_stateful,
     GraphDef&& function_body)
     : func_attr_(func_attr),
       input_arg_expansions_(input_arg_expansions),
-      output_arg_expansions_(output_arg_expansions) {
+      output_arg_expansions_(output_arg_expansions),
+      is_stateful_(is_stateful) {
   id = func_name;
-  // Fill the feed nodes with input placeholders
+  keep_ops = keep_nodes;
+  // Swap the graph body.
+  graph.Swap(&function_body);
+  // Fill the feed nodes with input placeholders.
   for (const InputArgExpansion& input_arg : input_arg_expansions_) {
     for (const string& placeholder : input_arg.placeholders) {
       feed.emplace_back(placeholder, Tensor());
       input_arg_placeholders_.insert(placeholder);
     }
   }
-  // Fill the fetch nodes with outputs
+  // Fill the fetch nodes with outputs.
   for (const OutputArgExpansion& output_arg : output_arg_expansions_) {
     for (const string& output_tensor : output_arg.output_tensors) {
       fetch.push_back(output_tensor);
     }
   }
-  // Swap the graph body
-  graph.Swap(&function_body);
+  // Send nodes (they are not stateful) must also be preserved in the graph.
+  for (const NodeDef& node : graph.node()) {
+    if (IsSend(node)) {
+      keep_ops.push_back(node.name());
+    }
+  }
 }
 
 const std::vector<InputArgExpansion>& GrapplerFunctionItem::inputs() const {
@@ -365,6 +373,8 @@ const GraphDef& GrapplerFunctionItem::function_body() const { return graph; }
 
 GraphDef& GrapplerFunctionItem::mutable_function_body() { return graph; }
 
+bool GrapplerFunctionItem::is_stateful() const { return is_stateful_; }
+
 GrapplerFunctionItem& GrapplerFunctionItem::SwapFunctionBody(GraphDef&& other) {
   graph.Swap(&other);
   return *this;
@@ -380,6 +390,33 @@ std::vector<string> OutputTensors(const GrapplerFunctionItem& item) {
   return output_tensors;
 }
 
+bool HasParametrizedType(const FunctionDef& func) {
+  const auto is_type_parametrized = [](const OpDef::ArgDef& arg) {
+    return !arg.type_attr().empty() || !arg.number_attr().empty() ||
+           !arg.type_list_attr().empty();
+  };
+
+  const auto& input = func.signature().input_arg();
+  const auto& output = func.signature().output_arg();
+  return std::any_of(input.begin(), input.end(), is_type_parametrized) ||
+         std::any_of(output.begin(), output.end(), is_type_parametrized);
+}
+
+bool HasParametrizedBody(const FunctionDef& func) {
+  const auto is_parametrized = [&](const NodeDef& node) {
+    for (const auto& attr : node.attr()) {
+      if (!attr.second.placeholder().empty()) return true;
+    }
+    return false;
+  };
+  return std::any_of(func.node_def().begin(), func.node_def().end(),
+                     is_parametrized);
+}
+
+bool IsParametrized(const FunctionDef& func) {
+  return HasParametrizedType(func) || HasParametrizedBody(func);
+}
+
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const AttrValueMap& func_instantiation_attr,
                                 const FunctionLibraryDefinition& flib,
@@ -408,6 +445,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   std::vector<InputArgExpansion> inputs;
   std::vector<OutputArgExpansion> outputs;
+  std::vector<string> keep_nodes;
 
   // Function body shares the library with the graph that instantiated it.
   GraphDef function_body;
@@ -444,6 +482,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
     InputArgExpansion input_expansion{/*input_name=*/input.name(),
                                       /*data_type=*/input_data_type,
+                                      /*is_ref*/ input.is_ref(),
                                       /*placeholders=*/{input.name()}};
     connectivity.RegisterInputArgExpansion(input_expansion);
     inputs.push_back(input_expansion);
@@ -454,12 +493,21 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     NodeDef* new_node = function_body.add_node();
     *new_node = func_def_node;
 
+    const OpRegistrationData* registration;
+    TF_RETURN_IF_ERROR(flib.LookUp(func_def_node.op(), &registration));
+
     // Resolve all placeholder values using function instantiation attributes.
     TF_RETURN_IF_ERROR(ResolveFunctionBodyNodeAttrPlaceholders(
         func_instantiation_attr, new_node));
+
     // Register node output range in a function connectivity.
-    TF_RETURN_IF_ERROR(
-        RegisterFunctionBodyOutputs(flib, func_def_node, &connectivity));
+    TF_RETURN_IF_ERROR(RegisterFunctionBodyOutputs(*registration, func_def_node,
+                                                   &connectivity));
+
+    // Stateful and Send nodes must be preserved in a function body
+    if (registration->op_def.is_stateful() || IsSend(func_def_node)) {
+      keep_nodes.push_back(func_def_node.name());
+    }
   }
 
   // Rewrite inputs to use GraphDef format
@@ -483,19 +531,22 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
     OutputArgExpansion output{/*output_name=*/out.name(),
                               /*data_type=*/output_data_type,
+                              /*is_ref=*/out.is_ref(),
                               /*output_tensors=*/output_tensors};
     outputs.push_back(output);
   }
 
+  bool is_stateful = signature.is_stateful();
+
   *item = GrapplerFunctionItem(
       /*func_name=*/signature.name(),
       /*func_attr=*/AttrValueMap(func.attr().begin(), func.attr().end()),
-      inputs, outputs, std::move(function_body));
+      inputs, outputs, keep_nodes, is_stateful, std::move(function_body));
   return Status::OK();
 }
 
 // Register GrapplerFunctionItem input arg expansion and function body outputs
-// in the GrapplerFunctionConnectivity
+// in the GrapplerFunctionConnectivity.
 Status RegisterGrapplerFunctionConnectivity(
     const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
     GrapplerFunctionConnectivity* connectivity) {
@@ -513,6 +564,7 @@ Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
                                   const FunctionLibraryDefinition& flib,
                                   FunctionDef* func) {
   func->mutable_signature()->set_name(item.id);
+  func->mutable_signature()->set_is_stateful(item.is_stateful());
 
   // Build a GrapplerFunctionConnectivity from inputs and new function body.
   GrapplerFunctionConnectivity connectivity;
@@ -524,6 +576,7 @@ Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
     OpDef::ArgDef arg_def;
     arg_def.set_name(input_arg.input_name);
     arg_def.set_type(input_arg.data_type);
+    arg_def.set_is_ref(input_arg.is_ref);
     *func->mutable_signature()->add_input_arg() = arg_def;
   }
 
@@ -532,6 +585,7 @@ Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
     OpDef::ArgDef arg_def;
     arg_def.set_name(output_arg.output_name);
     arg_def.set_type(output_arg.data_type);
+    arg_def.set_is_ref(output_arg.is_ref);
     *func->mutable_signature()->add_output_arg() = arg_def;
 
     CHECK(output_arg.output_tensors.size() == 1)  // do some sanity checking
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 2ac3917a66..ab369bcad7 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -41,6 +41,7 @@ struct InputArgExpansion {
   // different data types
   string input_name;                 // name of the function input argument
   DataType data_type;                // input data type
+  bool is_ref;                       // if true, inputs are required to be refs
   std::vector<string> placeholders;  // names of placeholder nodes in the
                                      // function body
 };
@@ -55,6 +56,7 @@ struct OutputArgExpansion {
   // different data types
   string output_name;                  // name of the function output argument
   DataType data_type;                  // output data type
+  bool is_ref;                         // if true, outputs are refs
   std::vector<string> output_tensors;  // names of output tensor from the
                                        // function body nodes
 };
@@ -136,6 +138,7 @@ class GrapplerFunctionItem : public GrapplerItem {
       const string& func_name, const AttrValueMap& func_attr,
       const std::vector<InputArgExpansion>& input_arg_expansions,
       const std::vector<OutputArgExpansion>& output_arg_expansions,
+      const std::vector<string>& keep_nodes, bool is_stateful,
       GraphDef&& function_body);
 
   bool IsInputPlaceholder(const string& node_name) const;
@@ -152,6 +155,8 @@ class GrapplerFunctionItem : public GrapplerItem {
   const GraphDef& function_body() const;
   GraphDef& mutable_function_body();
 
+  bool is_stateful() const;
+
   GrapplerFunctionItem& SwapFunctionBody(GraphDef&& other);
 
  private:
@@ -162,11 +167,25 @@ class GrapplerFunctionItem : public GrapplerItem {
   std::vector<OutputArgExpansion> output_arg_expansions_;
 
   std::set<string> input_arg_placeholders_;
+
+  bool is_stateful_;
 };
 
 // Return all output tensors referenced by item output args.
 std::vector<string> OutputTensors(const GrapplerFunctionItem& item);
 
+// Check if function input/output types are fully defined only at instantiation
+// time (parametrized by its instantiation node).
+bool HasParametrizedType(const FunctionDef& func);
+
+// Check if a function body is parametrized by its instantiation node. Function
+// body is parametrized, if it has at least one node with a 'placeholder'
+// attribute.
+bool HasParametrizedBody(const FunctionDef& func);
+
+// Check if function has parametrized type or body.
+bool IsParametrized(const FunctionDef& func);
+
 // Make a GrapplerFunctionItem from the function definition and attributes.
 // Return error if the given function def cannot be converted.
 Status MakeGrapplerFunctionItem(
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index a9a708bf67..54d235a8a4 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -30,12 +30,37 @@ namespace {
 
 class FunctionsTest : public ::testing::Test {};
 
+TEST_F(FunctionsTest, IsParametrized) {
+  // Function is defined for multiple input types.
+  FunctionDef parametrized_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  // Function is defined just for float inputs.
+  FunctionDef non_parametrized_func = FunctionDefHelper::Create(
+      "MyMul", {"x:float", "y:float"}, {"z:float"}, {},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", DT_FLOAT}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  EXPECT_TRUE(HasParametrizedType(parametrized_func));
+  EXPECT_TRUE(HasParametrizedBody(parametrized_func));
+  EXPECT_TRUE(IsParametrized(parametrized_func));
+
+  EXPECT_FALSE(HasParametrizedType(non_parametrized_func));
+  EXPECT_FALSE(HasParametrizedBody(non_parametrized_func));
+  EXPECT_FALSE(IsParametrized(non_parametrized_func));
+}
+
 TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandFunctionDefInput) {
   GrapplerFunctionConnectivity connectivity;
 
-  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
   connectivity.RegisterInputArgExpansion(
-      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
+      {"inputA", DT_FLOAT, /*is_ref=*/false, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, /*is_ref=*/false, {"inputB_0", "inputB_1"}});
 
   connectivity.RegisterFunctionBodyOutputs("Add", {{"z", {0, 1}}});
   connectivity.RegisterFunctionBodyOutputs("Func",
@@ -98,9 +123,10 @@ TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandFunctionDefInput) {
 TEST_F(FunctionsTest, GrapplerFunctionConnectivity_AsFunctionDefInput) {
   GrapplerFunctionConnectivity connectivity;
 
-  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
   connectivity.RegisterInputArgExpansion(
-      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
+      {"inputA", DT_FLOAT, /*is_ref=*/false, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, /*is_ref=*/false, {"inputB_0", "inputB_1"}});
 
   connectivity.RegisterFunctionBodyOutputs("Add", {{"z", {0, 1}}});
   connectivity.RegisterFunctionBodyOutputs("Func",
@@ -136,9 +162,10 @@ TEST_F(FunctionsTest, GrapplerFunctionConnectivity_AsFunctionDefInput) {
 TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandNodeInputs) {
   GrapplerFunctionConnectivity connectivity;
 
-  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
   connectivity.RegisterInputArgExpansion(
-      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
+      {"inputA", DT_FLOAT, /*is_ref=*/false, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, /*is_ref=*/false, {"inputB_0", "inputB_1"}});
 
   NodeDef node;
   node.add_input("inputA:0");
-- 
GitLab


From 8b1c3049028d1c25d7f4acc3af794918d64aafdf Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Wed, 18 Apr 2018 12:51:56 -0700
Subject: [PATCH 084/450] Moving all state (variables) required for
 _EmbeddingColumn and _SharedEmbeddingColumn into a base.Layer

PiperOrigin-RevId: 193401873
---
 .../python/feature_column/feature_column.py   | 337 +++++++++---------
 .../feature_column/feature_column_test.py     | 280 +++++++--------
 2 files changed, 293 insertions(+), 324 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index f9201a4794..0ad8131599 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -135,6 +135,7 @@ import numpy as np
 import six
 
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -462,6 +463,16 @@ def linear_model(features,
     return predictions
 
 
+def _add_to_collections(var, weight_collections):
+  # TODO(rohanj): Explore adding a _get_variable_list method on `Variable`
+  # so that we don't have to do this check.
+  if isinstance(var, variables.PartitionedVariable):
+    for constituent_var in list(var):
+      ops.add_to_collections(weight_collections, constituent_var)
+  else:
+    ops.add_to_collections(weight_collections, var)
+
+
 class _FCLinearWrapper(base.Layer):
   """Wraps a _FeatureColumn in a layer for use in a linear model.
 
@@ -482,12 +493,8 @@ class _FCLinearWrapper(base.Layer):
     self._units = units
     self._sparse_combiner = sparse_combiner
     self._weight_collections = weight_collections
-    self._state = {}
 
   def build(self, _):
-    self._state = self._feature_column._create_state(  # pylint: disable=protected-access
-        self._weight_collections, self.add_variable)
-
     if isinstance(self._feature_column, _CategoricalColumn):
       weight = self.add_variable(
           name='weights',
@@ -501,7 +508,7 @@ class _FCLinearWrapper(base.Layer):
           shape=[num_elements, self._units],
           initializer=init_ops.zeros_initializer(),
           trainable=self.trainable)
-    ops.add_to_collections(self._weight_collections, weight)
+    _add_to_collections(weight, self._weight_collections)
     self._weight_var = weight
     self.built = True
 
@@ -513,8 +520,7 @@ class _FCLinearWrapper(base.Layer):
         sparse_combiner=self._sparse_combiner,
         weight_collections=self._weight_collections,
         trainable=self.trainable,
-        weight_var=self._weight_var,
-        state=self._state)
+        weight_var=self._weight_var)
     return weighted_sum
 
 
@@ -538,7 +544,7 @@ class _BiasLayer(base.Layer):
         shape=[self._units],
         initializer=init_ops.zeros_initializer(),
         trainable=self.trainable)
-    ops.add_to_collections(self._weight_collections, self._bias_variable)
+    _add_to_collections(self._bias_variable, self._weight_collections)
     self.built = True
 
   def call(self, _):
@@ -806,11 +812,22 @@ def embedding_column(
     initializer = init_ops.truncated_normal_initializer(
         mean=0.0, stddev=1 / math.sqrt(dimension))
 
+  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access
+
+  def _creator(weight_collections, scope):
+    embedding_column_layer = _EmbeddingColumnLayer(
+        embedding_shape=embedding_shape,
+        initializer=initializer,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        name='embedding_column_layer')
+    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable
+
   return _EmbeddingColumn(
       categorical_column=categorical_column,
       dimension=dimension,
       combiner=combiner,
-      initializer=initializer,
+      layer_creator=_creator,
       ckpt_to_load_from=ckpt_to_load_from,
       tensor_name_in_ckpt=tensor_name_in_ckpt,
       max_norm=max_norm,
@@ -933,6 +950,7 @@ def shared_embedding_columns(
   sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
 
   c0 = sorted_columns[0]
+  num_buckets = c0._num_buckets  # pylint: disable=protected-access
   if not isinstance(c0, _CategoricalColumn):
     raise ValueError(
         'All categorical_columns must be subclasses of _CategoricalColumn. '
@@ -948,23 +966,45 @@ def shared_embedding_columns(
           'the same type, or be weighted_categorical_column of the same type. '
           'Given column: {} of type: {} does not match given column: {} of '
           'type: {}'.format(c0, type(c0), c, type(c)))
+    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same number of buckets. Given column: {} with buckets: {} does  '
+          'not match column: {} with buckets: {}'.format(
+              c0, num_buckets, c, c._num_buckets))  # pylint: disable=protected-access
 
   if not shared_embedding_collection_name:
     shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
     shared_embedding_collection_name += '_shared_embedding'
 
+  # Create the state (_EmbeddingColumnLayer) here.
+  embedding_shape = num_buckets, dimension
+
+  shared_embedding_column_layer = _EmbeddingColumnLayer(
+      embedding_shape=embedding_shape,
+      initializer=initializer,
+      weight_collections=[],
+      trainable=trainable,
+      name=shared_embedding_collection_name)
+
   result = []
   for column in categorical_columns:
-    result.append(_SharedEmbeddingColumn(
-        categorical_column=column,
-        dimension=dimension,
-        combiner=combiner,
-        initializer=initializer,
-        shared_embedding_collection_name=shared_embedding_collection_name,
-        ckpt_to_load_from=ckpt_to_load_from,
-        tensor_name_in_ckpt=tensor_name_in_ckpt,
-        max_norm=max_norm,
-        trainable=trainable))
+    result.append(
+        _SharedEmbeddingColumn(
+            categorical_column=column,
+            initializer=initializer,
+            dimension=dimension,
+            combiner=combiner,
+            var_scope_name=shared_embedding_collection_name,
+            ckpt_to_load_from=ckpt_to_load_from,
+            tensor_name_in_ckpt=tensor_name_in_ckpt,
+            max_norm=max_norm,
+            trainable=trainable))
+
+  for single_result in result:
+    single_result._set_layer(shared_embedding_column_layer)  # pylint: disable=protected-access
+    single_result._set_all_columns(result)  # pylint: disable=protected-access
+
   return result
 
 
@@ -1721,6 +1761,57 @@ def crossed_column(keys, hash_bucket_size, hash_key=None):
       hash_key=hash_key)
 
 
+# TODO(rohanj): Clearly define semantics of this layer.
+class _EmbeddingColumnLayer(base.Layer):
+  """A layer that stores all the state required for an embedding column."""
+
+  def __init__(self,
+               embedding_shape,
+               initializer,
+               weight_collections=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    """Constructor.
+
+    Args:
+      embedding_shape: Shape of the embedding variable used for lookup.
+      initializer: A variable initializer function to be used in embedding
+        variable initialization. This layer has no default of its own; callers
+        typically pass `tf.truncated_normal_initializer` with mean `0.0` and
+        standard deviation `1/sqrt(dimension)`.
+      weight_collections: A list of collection names to which the Variable will
+        be added. Note that, variables will also be added to collections
+        `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+      trainable: If `True` also add the variable to the graph collection
+        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+      name: Name of the layer
+      **kwargs: keyword named properties.
+    """
+    super(_EmbeddingColumnLayer, self).__init__(
+        trainable=trainable, name=name, **kwargs)
+    self._embedding_shape = embedding_shape
+    self._initializer = initializer
+    self._weight_collections = weight_collections
+
+  def build(self, _):
+    self._embedding_weight_var = self.add_variable(
+        name='embedding_weights',
+        shape=self._embedding_shape,
+        dtype=dtypes.float32,
+        initializer=self._initializer,
+        trainable=self.trainable)
+    # self.add_variable already appends to GLOBAL_VARIABLES collection.
+    if self._weight_collections and not context.executing_eagerly():
+      for weight_collection in self._weight_collections:
+        if weight_collection != ops.GraphKeys.GLOBAL_VARIABLES:
+          _add_to_collections(self._embedding_weight_var, [weight_collection])
+    self.built = True
+
+  def call(self, _):
+    return self._embedding_weight_var
+
+
 class _FeatureColumn(object):
   """Represents a feature column abstraction.
 
@@ -1794,18 +1885,13 @@ class _FeatureColumn(object):
     """
     pass
 
-  def _create_state(self, weight_collections=None, creator=None):
-    """Returns an object that captures the state of the column.
+  def _reset_config(self):
+    """Resets the configuration in the column.
 
-    Args:
-      weight_collections: Collections to add the variable to
-      creator: Variable creator method called, if provided.
-
-    Returns:
-      An object that encapsulates the state of the column. Can return None.
+    Some feature columns e.g. embedding or shared embedding columns might
+    have some state that is needed to be reset sometimes. Use this method
+    in that scenario.
     """
-    del weight_collections, creator  # Unused
-    return None
 
 
 class _DenseColumn(_FeatureColumn):
@@ -1826,11 +1912,7 @@ class _DenseColumn(_FeatureColumn):
     pass
 
   @abc.abstractmethod
-  def _get_dense_tensor(self,
-                        inputs,
-                        weight_collections=None,
-                        trainable=None,
-                        state=None):
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     """Returns a `Tensor`.
 
     The output of this function will be used by model-builder-functions. For
@@ -1848,9 +1930,6 @@ class _DenseColumn(_FeatureColumn):
         will be created) are added.
       trainable: If `True` also add variables to the graph collection
         `GraphKeys.TRAINABLE_VARIABLES` (see @{tf.Variable}).
-      state: An object encapsulating the state of the column. Columns that
-        create state using the _create_state method would have that state
-        passed in to this method.
 
     Returns:
       `Tensor` of shape [batch_size] + `_variable_shape`.
@@ -1864,8 +1943,7 @@ def _create_weighted_sum(column,
                          sparse_combiner,
                          weight_collections,
                          trainable,
-                         weight_var=None,
-                         state=None):
+                         weight_var=None):
   """Creates a weighted sum for a dense or sparse column for linear_model."""
   if isinstance(column, _CategoricalColumn):
     return _create_categorical_column_weighted_sum(
@@ -1883,8 +1961,7 @@ def _create_weighted_sum(column,
         units=units,
         weight_collections=weight_collections,
         trainable=trainable,
-        weight_var=weight_var,
-        state=state)
+        weight_var=weight_var)
 
 
 def _create_dense_column_weighted_sum(column,
@@ -1892,20 +1969,12 @@ def _create_dense_column_weighted_sum(column,
                                       units,
                                       weight_collections,
                                       trainable,
-                                      weight_var=None,
-                                      state=None):
+                                      weight_var=None):
   """Create a weighted sum of a dense column for linear_model."""
-  if state is not None:
-    tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-        builder,
-        weight_collections=weight_collections,
-        trainable=trainable,
-        state=state)
-  else:
-    tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-        builder,
-        weight_collections=weight_collections,
-        trainable=trainable)
+  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+      builder,
+      weight_collections=weight_collections,
+      trainable=trainable)
   num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
   batch_size = array_ops.shape(tensor)[0]
   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
@@ -2368,10 +2437,10 @@ class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
 
 class _EmbeddingColumn(
     _DenseColumn, _SequenceDenseColumn,
-    collections.namedtuple('_EmbeddingColumn', (
-        'categorical_column', 'dimension', 'combiner', 'initializer',
-        'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'
-    ))):
+    collections.namedtuple(
+        '_EmbeddingColumn',
+        ('categorical_column', 'dimension', 'combiner', 'layer_creator',
+         'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
   """See `embedding_column`."""
 
   @property
@@ -2393,33 +2462,10 @@ class _EmbeddingColumn(
       self._shape = tensor_shape.vector(self.dimension)
     return self._shape
 
-  def _create_state(self, weight_collections=None, creator=None):
-    variables_map = {}
-    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-    if creator is not None:
-      embedding_weights = creator(
-          name='embedding_weights',
-          shape=embedding_shape,
-          dtype=dtypes.float32,
-          initializer=self.initializer,
-          trainable=self.trainable)
-      ops.add_to_collections(weight_collections, embedding_weights)
-    else:
-      embedding_weights = variable_scope.get_variable(
-          name='embedding_weights',
-          shape=embedding_shape,
-          dtype=dtypes.float32,
-          initializer=self.initializer,
-          trainable=self.trainable,
-          collections=weight_collections)
-    variables_map['embedding_weights'] = embedding_weights
-    return variables_map
-
   def _get_dense_tensor_internal(self,
                                  inputs,
                                  weight_collections=None,
-                                 trainable=None,
-                                 state=None):
+                                 trainable=None):
     """Private method that follows the signature of _get_dense_tensor."""
     # Get sparse IDs and weights.
     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
@@ -2427,9 +2473,9 @@ class _EmbeddingColumn(
     sparse_ids = sparse_tensors.id_tensor
     sparse_weights = sparse_tensors.weight_tensor
 
-    if state is None:
-      state = self._create_state(weight_collections)
-    embedding_weights = state['embedding_weights']
+    embedding_weights = self.layer_creator(
+        weight_collections=weight_collections,
+        scope=variable_scope.get_variable_scope())
 
     if self.ckpt_to_load_from is not None:
       to_restore = embedding_weights
@@ -2448,11 +2494,7 @@ class _EmbeddingColumn(
         name='%s_weights' % self.name,
         max_norm=self.max_norm)
 
-  def _get_dense_tensor(self,
-                        inputs,
-                        weight_collections=None,
-                        trainable=None,
-                        state=None):
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
           'In embedding_column: {}. '
@@ -2467,8 +2509,7 @@ class _EmbeddingColumn(
     return self._get_dense_tensor_internal(
         inputs=inputs,
         weight_collections=weight_collections,
-        trainable=trainable,
-        state=state)
+        trainable=trainable)
 
   def _get_sequence_dense_tensor(
       self, inputs, weight_collections=None, trainable=None):
@@ -2492,13 +2533,20 @@ class _EmbeddingColumn(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
 
 
+def _get_graph_for_variable(var):
+  """Returns `var.graph`, using the first element of a PartitionedVariable."""
+  if isinstance(var, variables.PartitionedVariable):
+    var = list(var)[0]
+  return var.graph
+
+
 class _SharedEmbeddingColumn(
     _DenseColumn,
-    collections.namedtuple('_SharedEmbeddingColumn', (
-        'categorical_column', 'dimension', 'combiner', 'initializer',
-        'shared_embedding_collection_name', 'ckpt_to_load_from',
-        'tensor_name_in_ckpt', 'max_norm', 'trainable'
-    ))):
+    collections.namedtuple(
+        '_SharedEmbeddingColumn',
+        ('categorical_column', 'dimension', 'combiner', 'initializer',
+         'var_scope_name', 'ckpt_to_load_from', 'tensor_name_in_ckpt',
+         'max_norm', 'trainable'))):
   """See `embedding_column`."""
 
   @property
@@ -2509,7 +2557,7 @@ class _SharedEmbeddingColumn(
 
   @property
   def _var_scope_name(self):
-    return self.shared_embedding_collection_name
+    return self.var_scope_name
 
   @property
   def _parse_example_spec(self):
@@ -2518,45 +2566,29 @@ class _SharedEmbeddingColumn(
   def _transform_feature(self, inputs):
     return inputs.get(self.categorical_column)
 
+  def _set_layer(self, layer):
+    self._layer = layer
+
+  def _set_all_columns(self, all_columns):
+    self._all_columns = all_columns
+
+  def _reset_config(self):
+    config = self._layer.get_config()
+    config['embedding_shape'] = (
+        self.categorical_column._num_buckets,  # pylint: disable=protected-access
+        self.dimension)
+    config['initializer'] = self.initializer
+    self._layer = self._layer.__class__.from_config(config)
+    for column in self._all_columns:
+      column._set_layer(self._layer)  # pylint: disable=protected-access
+
   @property
   def _variable_shape(self):
     if not hasattr(self, '_shape'):
       self._shape = tensor_shape.vector(self.dimension)
     return self._shape
 
-  def _create_state(self, weight_collections=None, creator=None):
-    variables_map = {}
-    shared_embedding_collection = ops.get_collection(
-        self.shared_embedding_collection_name)
-    if not shared_embedding_collection:
-      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-      if creator is not None:
-        embedding_weights = creator(
-            name='embedding_weights',
-            shape=embedding_shape,
-            dtype=dtypes.float32,
-            initializer=self.initializer,
-            trainable=self.trainable)
-        ops.add_to_collections(weight_collections, embedding_weights)
-      else:
-        embedding_weights = variable_scope.get_variable(
-            name='embedding_weights',
-            shape=embedding_shape,
-            dtype=dtypes.float32,
-            initializer=self.initializer,
-            trainable=self.trainable,
-            collections=weight_collections)
-      ops.add_to_collection(self.shared_embedding_collection_name,
-                            embedding_weights)
-      variables_map['embedding_weights'] = embedding_weights
-
-    return variables_map
-
-  def _get_dense_tensor(self,
-                        inputs,
-                        weight_collections=None,
-                        trainable=None,
-                        state=None):
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     # This method is called from a variable_scope with name _var_scope_name,
     # which is shared among all shared embeddings. Open a name_scope here, so
     # that the ops for different columns have distinct names.
@@ -2567,38 +2599,17 @@ class _SharedEmbeddingColumn(
       sparse_ids = sparse_tensors.id_tensor
       sparse_weights = sparse_tensors.weight_tensor
 
-      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-      shared_embedding_collection = ops.get_collection(
-          self.shared_embedding_collection_name)
-      if shared_embedding_collection:
-        if len(shared_embedding_collection) > 1:
-          raise ValueError(
-              'Collection {} can only contain one variable. '
-              'Suggested fix A: Choose a unique name for this collection. '
-              'Suggested fix B: Do not add any variables to this collection. '
-              'The feature_column library already adds a variable under the '
-              'hood.'.format(shared_embedding_collection))
-        embedding_weights = shared_embedding_collection[0]
-        if embedding_weights.get_shape() != embedding_shape:
-          raise ValueError(
-              'Shared embedding collection {} contains variable {} of '
-              'unexpected shape {}. Expected shape is {}. '
-              'Suggested fix A: Choose a unique name for this collection. '
-              'Suggested fix B: Do not add any variables to this collection. '
-              'The feature_column library already adds a variable under the '
-              'hood.'.format(
-                  self.shared_embedding_collection_name, embedding_weights.name,
-                  embedding_weights.get_shape(), embedding_shape))
-      else:
-        embedding_weights = variable_scope.get_variable(
-            name='embedding_weights',
-            shape=embedding_shape,
-            dtype=dtypes.float32,
-            initializer=self.initializer,
-            trainable=self.trainable and trainable,
-            collections=weight_collections)
-        ops.add_to_collection(
-            self.shared_embedding_collection_name, embedding_weights)
+      embedding_weights = self._layer(
+          None, scope=variable_scope.get_variable_scope())
+      # If we're in graph mode and this is called with a different graph,
+      # then we should reset.
+      if not context.executing_eagerly() and (
+          ops.get_default_graph() !=
+          _get_graph_for_variable(embedding_weights)):
+        self._reset_config()
+        embedding_weights = self._layer(
+            None, scope=variable_scope.get_variable_scope())
+
       if self.ckpt_to_load_from is not None:
         to_restore = embedding_weights
         if isinstance(to_restore, variables.PartitionedVariable):
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 62718db0e5..46404abadc 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -2885,6 +2885,114 @@ class FunctionalInputLayerTest(test.TestCase):
                 features['price2']: [[1.], [5.]],
             })
 
+  def test_multiple_layers_with_same_embedding_column(self):
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)
+
+    with ops.Graph().as_default():
+      features = {
+          'sparse_feature': [['a'], ['x']],
+      }
+      all_cols = [some_embedding_column]
+      fc.input_layer(features, all_cols)
+      fc.input_layer(features, all_cols)
+      # Make sure that 2 variables get created in this case.
+      self.assertEqual(2, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      expected_var_names = [
+          'input_layer/sparse_feature_embedding/embedding_weights:0',
+          'input_layer_1/sparse_feature_embedding/embedding_weights:0'
+      ]
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
+  def test_multiple_layers_with_same_shared_embedding_column(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension)
+
+    with ops.Graph().as_default():
+      features = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      all_cols = [embedding_column_a, embedding_column_b]
+      fc.input_layer(features, all_cols)
+      fc.input_layer(features, all_cols)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      self.assertItemsEqual(
+          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
+  def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension)
+    all_cols = [embedding_column_a, embedding_column_b]
+
+    with ops.Graph().as_default():
+      features = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      fc.input_layer(features, all_cols)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+
+    with ops.Graph().as_default():
+      features1 = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+
+      fc.input_layer(features1, all_cols)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      self.assertItemsEqual(
+          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
   def test_with_numpy_input_fn(self):
     embedding_values = (
         (1., 2., 3., 4., 5.),  # id 0
@@ -4504,7 +4612,6 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('mean', embedding_column.combiner)
-    self.assertIsNotNone(embedding_column.initializer)
     self.assertIsNone(embedding_column.ckpt_to_load_from)
     self.assertIsNone(embedding_column.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column.max_norm)
@@ -4529,7 +4636,6 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
-    self.assertEqual('my_initializer', embedding_column.initializer())
     self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
     self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
     self.assertEqual(42., embedding_column.max_norm)
@@ -4560,7 +4666,6 @@ class EmbeddingColumnTest(test.TestCase):
 
       self.assertEqual(embedding_dimension, embedding_column.dimension)
       self.assertEqual('my_combiner', embedding_column.combiner)
-      self.assertEqual('my_initializer', embedding_column.initializer())
       self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
       self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
       self.assertEqual(42., embedding_column.max_norm)
@@ -4675,72 +4780,6 @@ class EmbeddingColumnTest(test.TestCase):
       self.assertAllEqual(embedding_values, global_vars[0].eval())
       self.assertAllEqual(expected_lookups, embedding_lookup.eval())
 
-  def test_get_dense_tensor_with_state(self):
-    # Inputs.
-    vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(4, 5))
-
-    # Embedding variable.
-    embedding_dimension = 2
-    embedding_values = (
-        (1., 2.),  # id 0
-        (3., 5.),  # id 1
-        (7., 11.)  # id 2
-    )
-
-    def _initializer(shape, dtype, partition_info):
-      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
-      self.assertEqual(dtypes.float32, dtype)
-      self.assertIsNone(partition_info)
-      return embedding_values
-
-    # Expected lookup result, using combiner='mean'.
-    expected_lookups = (
-        # example 0, ids [2], embedding = [7, 11]
-        (7., 11.),
-        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
-        (2., 3.5),
-        # example 2, ids [], embedding = [0, 0]
-        (0., 0.),
-        # example 3, ids [1], embedding = [3, 5]
-        (3., 5.),
-    )
-
-    # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column,
-        dimension=embedding_dimension,
-        initializer=_initializer)
-
-    # Create embedding_weights variable.
-    weight_collections = [
-        ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
-    ]
-    state = embedding_column._create_state(weight_collections)
-
-    # Provide sparse input and get dense result.
-    embedding_lookup = embedding_column._get_dense_tensor(
-        _LazyBuilder({
-            'aaa': sparse_input
-        }), state=state)
-
-    # Assert expected embedding variable and lookups.
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
-
   def test_get_dense_tensor_3d(self):
     # Inputs.
     vocabulary_size = 4
@@ -4795,8 +4834,8 @@ class EmbeddingColumnTest(test.TestCase):
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
       self.assertAllEqual(expected_lookups, embedding_lookup.eval())
@@ -4823,8 +4862,9 @@ class EmbeddingColumnTest(test.TestCase):
         }), weight_collections=('my_vars',))
 
     # Assert expected embedding variable and lookups.
-    self.assertItemsEqual(
-        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     my_vars = ops.get_collection('my_vars')
     self.assertItemsEqual(
         ('embedding_weights:0',), tuple([v.name for v in my_vars]))
@@ -5243,14 +5283,12 @@ class SharedEmbeddingColumnTest(test.TestCase):
     self.assertEqual(embedding_dimension, embedding_column_b.dimension)
     self.assertEqual('mean', embedding_column_a.combiner)
     self.assertEqual('mean', embedding_column_b.combiner)
-    self.assertIsNotNone(embedding_column_a.initializer)
-    self.assertIsNotNone(embedding_column_b.initializer)
     self.assertIsNone(embedding_column_a.ckpt_to_load_from)
     self.assertIsNone(embedding_column_b.ckpt_to_load_from)
     self.assertEqual('aaa_bbb_shared_embedding',
-                     embedding_column_a.shared_embedding_collection_name)
+                     embedding_column_a.var_scope_name)
     self.assertEqual('aaa_bbb_shared_embedding',
-                     embedding_column_b.shared_embedding_collection_name)
+                     embedding_column_b.var_scope_name)
     self.assertIsNone(embedding_column_a.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column_b.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column_a.max_norm)
@@ -5296,12 +5334,10 @@ class SharedEmbeddingColumnTest(test.TestCase):
     self.assertEqual(embedding_dimension, embedding_column_b.dimension)
     self.assertEqual('my_combiner', embedding_column_a.combiner)
     self.assertEqual('my_combiner', embedding_column_b.combiner)
-    self.assertEqual('my_initializer', embedding_column_a.initializer())
-    self.assertEqual('my_initializer', embedding_column_b.initializer())
     self.assertEqual('shared_embedding_collection_name',
-                     embedding_column_a.shared_embedding_collection_name)
+                     embedding_column_a.var_scope_name)
     self.assertEqual('shared_embedding_collection_name',
-                     embedding_column_b.shared_embedding_collection_name)
+                     embedding_column_b.var_scope_name)
     self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from)
     self.assertEqual('my_ckpt', embedding_column_b.ckpt_to_load_from)
     self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt)
@@ -5351,9 +5387,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
 
       self.assertEqual(embedding_dimension, embedding_column_a.dimension)
       self.assertEqual('my_combiner', embedding_column_a.combiner)
-      self.assertEqual('my_initializer', embedding_column_a.initializer())
       self.assertEqual('shared_embedding_collection_name',
-                       embedding_column_a.shared_embedding_collection_name)
+                       embedding_column_a.var_scope_name)
       self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from)
       self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt)
       self.assertEqual(42., embedding_column_a.max_norm)
@@ -5537,80 +5572,6 @@ class SharedEmbeddingColumnTest(test.TestCase):
       self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
       self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
 
-  def test_get_dense_tensor_with_state(self):
-    # Inputs.
-    vocabulary_size = 3
-    # -1 values are ignored.
-    input_a = np.array([
-        [2, -1, -1],  # example 0, ids [2]
-        [0, 1, -1]
-    ])  # example 1, ids [0, 1]
-    input_b = np.array([
-        [0, -1, -1],  # example 0, ids [0]
-        [-1, -1, -1]
-    ])  # example 1, ids []
-    input_features = {'aaa': input_a, 'bbb': input_b}
-
-    # Embedding variable.
-    embedding_dimension = 2
-    embedding_values = (
-        (1., 2.),  # id 0
-        (3., 5.),  # id 1
-        (7., 11.)  # id 2
-    )
-
-    def _initializer(shape, dtype, partition_info):
-      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
-      self.assertEqual(dtypes.float32, dtype)
-      self.assertIsNone(partition_info)
-      return embedding_values
-
-    # Expected lookup result, using combiner='mean'.
-    expected_lookups_a = (
-        # example 0:
-        (7., 11.),  # ids [2], embedding = [7, 11]
-        # example 1:
-        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
-    )
-    expected_lookups_b = (
-        # example 0:
-        (1., 2.),  # ids [0], embedding = [1, 2]
-        # example 1:
-        (0., 0.),  # ids [], embedding = [0, 0]
-    )
-
-    # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
-        [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension,
-        initializer=_initializer)
-
-    # Create state.
-    weight_collections = [
-        ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
-    ]
-    state = embedding_column_a._create_state(weight_collections)
-
-    # Provide sparse input and get dense result.
-    embedding_lookup_a = embedding_column_a._get_dense_tensor(
-        _LazyBuilder(input_features), state=state)
-    embedding_lookup_b = embedding_column_b._get_dense_tensor(
-        _LazyBuilder(input_features), state=state)
-
-    # Assert expected embedding variable and lookups.
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    embedding_var = global_vars[0]
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, embedding_var.eval())
-      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
-      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
-
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
@@ -5912,10 +5873,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           tuple([v.name for v in trainable_vars]))
     else:
       self.assertItemsEqual([], tuple([v.name for v in trainable_vars]))
-    shared_embedding_vars = ops.get_collection('aaa_bbb_shared_embedding')
-    self.assertItemsEqual(
-        ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
-        tuple([v.name for v in shared_embedding_vars]))
+    shared_embedding_vars = global_vars
     with _initialized_session():
       self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
       self.assertAllEqual(expected_lookups, input_layer.eval())
-- 
GitLab


From f4c6a318eb9eb01440c313a4fc423ac267fdb74e Mon Sep 17 00:00:00 2001
From: Stanley Bileschi <bileschi@google.com>
Date: Wed, 18 Apr 2018 13:12:04 -0700
Subject: [PATCH 085/450] Improves error messaging for bad (empty) CSV files.

PiperOrigin-RevId: 193404804
---
 tensorflow/contrib/data/python/ops/readers.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 4ec8ae1c79..bbb808fbd7 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -156,12 +156,21 @@ def _infer_column_names(filenames, field_delim, use_quote_delim):
       "quoting": csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE
   }
   with file_io.FileIO(filenames[0], "r") as f:
-    column_names = next(csv.reader(f, **csv_kwargs))
+    try:
+      column_names = next(csv.reader(f, **csv_kwargs))
+    except StopIteration:
+      raise ValueError(("Received StopIteration when reading the header line "
+                        "of %s.  Empty file?") % filenames[0])
 
   for name in filenames[1:]:
     with file_io.FileIO(name, "r") as f:
-      if next(csv.reader(f, **csv_kwargs)) != column_names:
-        raise ValueError("Files have different column names in the header row.")
+      try:
+        if next(csv.reader(f, **csv_kwargs)) != column_names:
+          raise ValueError(
+              "Files have different column names in the header row.")
+      except StopIteration:  # next() found no header row in `name`
+        raise ValueError(("Received StopIteration when reading the header line "
+                          "of %s.  Empty file?") % name)
   return column_names
 
 
-- 
GitLab


From f28342c2caab42987e6761abeca84ba3147cddba Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 13:13:02 -0700
Subject: [PATCH 086/450] Prevent access to deallocated hash map upon exit().

PiperOrigin-RevId: 193404950
---
 tensorflow/core/lib/strings/numbers.cc | 33 ++++++++++++++++----------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index 8f34baa7de..c296daa95d 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -33,19 +33,26 @@ namespace tensorflow {
 
 namespace {
 
+template <typename T>
+const std::unordered_map<string, T>* GetSpecialNumsSingleton() {
+  static const std::unordered_map<string, T>* special_nums =
+      CHECK_NOTNULL((new const std::unordered_map<string, T>{
+          {"inf", std::numeric_limits<T>::infinity()},
+          {"+inf", std::numeric_limits<T>::infinity()},
+          {"-inf", -std::numeric_limits<T>::infinity()},
+          {"infinity", std::numeric_limits<T>::infinity()},
+          {"+infinity", std::numeric_limits<T>::infinity()},
+          {"-infinity", -std::numeric_limits<T>::infinity()},
+          {"nan", std::numeric_limits<T>::quiet_NaN()},
+          {"+nan", std::numeric_limits<T>::quiet_NaN()},
+          {"-nan", -std::numeric_limits<T>::quiet_NaN()},
+      }));
+  return special_nums;
+}
+
 template <typename T>
 T locale_independent_strtonum(const char* str, const char** endptr) {
-  static const std::unordered_map<string, T> special_nums = {
-      {"inf", std::numeric_limits<T>::infinity()},
-      {"+inf", std::numeric_limits<T>::infinity()},
-      {"-inf", -std::numeric_limits<T>::infinity()},
-      {"infinity", std::numeric_limits<T>::infinity()},
-      {"+infinity", std::numeric_limits<T>::infinity()},
-      {"-infinity", -std::numeric_limits<T>::infinity()},
-      {"nan", std::numeric_limits<T>::quiet_NaN()},
-      {"+nan", std::numeric_limits<T>::quiet_NaN()},
-      {"-nan", -std::numeric_limits<T>::quiet_NaN()},
-  };
+  auto special_nums = GetSpecialNumsSingleton<T>();
   std::stringstream s(str);
 
   // Check if str is one of the special numbers.
@@ -57,8 +64,8 @@ T locale_independent_strtonum(const char* str, const char** endptr) {
         std::tolower(special_num_str[i], std::locale::classic());
   }
 
-  auto entry = special_nums.find(special_num_str);
-  if (entry != special_nums.end()) {
+  auto entry = special_nums->find(special_num_str);
+  if (entry != special_nums->end()) {
     *endptr = str + (s.eof() ? static_cast<std::iostream::pos_type>(strlen(str))
                              : s.tellg());
     return entry->second;
-- 
GitLab


From 6fa949afca5f1549f87554475d053c608f0da379 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 18 Apr 2018 13:29:43 -0700
Subject: [PATCH 087/450] Catch OSError in gen_git_source.py subprocess call.

OSError occurs if git cannot be found. This is an initial fix for some
Windows build errors.

PiperOrigin-RevId: 193407250
---
 tensorflow/tools/git/gen_git_source.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index cbcdbf5b80..78d511969e 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -125,7 +125,7 @@ def configure(src_base_path, gen_path, debug=False):
       try:
         # In python 3.5, symlink function exists even on Windows. But requires
         # Windows Admin privileges, otherwise an OSError will be thrown.
-        if hasattr(os, 'symlink'):
+        if hasattr(os, "symlink"):
           os.symlink(src, os.path.join(gen_path, target))
         else:
           shutil.copy2(src, os.path.join(gen_path, target))
@@ -162,7 +162,7 @@ def get_git_version(git_base_path):
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
     return val if val else unknown_label
-  except subprocess.CalledProcessError:
+  except (subprocess.CalledProcessError, OSError):
     return unknown_label
 
 
-- 
GitLab


From 9fe297ffa8133309fe548df3a0208d0ff9305a66 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Wed, 18 Apr 2018 13:46:37 -0700
Subject: [PATCH 088/450] Internal-only change.

PiperOrigin-RevId: 193409980
---
 tensorflow/compiler/tests/BUILD                  |  2 +-
 .../contrib/data/python/kernel_tests/BUILD       | 15 ++++++++++++---
 tensorflow/contrib/distributions/BUILD           |  1 +
 tensorflow/contrib/estimator/BUILD               |  4 ++--
 tensorflow/contrib/legacy_seq2seq/BUILD          |  5 ++++-
 tensorflow/contrib/linalg/BUILD                  | 10 ++++++++--
 tensorflow/contrib/lookup/BUILD                  |  2 +-
 tensorflow/contrib/optimizer_v2/BUILD            |  1 +
 tensorflow/core/BUILD                            | 16 +++++++++++++++-
 tensorflow/examples/tutorials/mnist/BUILD        |  1 +
 tensorflow/python/BUILD                          |  4 ++--
 tensorflow/python/estimator/BUILD                |  1 +
 tensorflow/python/keras/BUILD                    |  2 +-
 tensorflow/python/kernel_tests/BUILD             |  7 +++++--
 tensorflow/python/kernel_tests/linalg/BUILD      |  1 +
 tensorflow/tools/docs/BUILD                      |  3 ++-
 16 files changed, 58 insertions(+), 17 deletions(-)

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index b9e42ca677..46b86c53aa 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -340,7 +340,7 @@ tf_xla_py_test(
 
 tf_xla_py_test(
     name = "ftrl_test",
-    size = "small",
+    size = "medium",
     srcs = ["ftrl_test.py"],
     deps = [
         ":xla_test",
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index b475c9fa6b..c554607960 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -122,7 +122,10 @@ py_test(
     size = "small",
     srcs = ["filter_dataset_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "optonly",
+    ],
     deps = [
         ":dataset_serialization_test",
         "//tensorflow/python:array_ops",
@@ -211,7 +214,10 @@ py_test(
     size = "medium",
     srcs = ["map_dataset_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "optonly",
+    ],
     deps = [
         ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:error_ops",
@@ -306,7 +312,10 @@ py_test(
     srcs = ["resample_test.py"],
     shard_count = 2,
     srcs_version = "PY2AND3",
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",
+    ],
     deps = [
         "//tensorflow/contrib/data/python/ops:resampling",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 20e432b88d..2d99e8172d 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -877,6 +877,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["optonly"],
 )
 
 cuda_py_test(
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 9f4cd44afb..9e88bc7de1 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -210,7 +210,7 @@ py_library(
 
 py_test(
     name = "head_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/estimator/head_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -250,7 +250,7 @@ py_library(
 
 py_test(
     name = "linear_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/estimator/linear_test.py"],
     srcs_version = "PY2AND3",
     tags = [
diff --git a/tensorflow/contrib/legacy_seq2seq/BUILD b/tensorflow/contrib/legacy_seq2seq/BUILD
index 8c2c4fd29c..4ce91a140f 100644
--- a/tensorflow/contrib/legacy_seq2seq/BUILD
+++ b/tensorflow/contrib/legacy_seq2seq/BUILD
@@ -58,5 +58,8 @@ cuda_py_tests(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
-    tags = ["noasan"],  # times out b/63678675
+    tags = [
+        "noasan",  # times out b/63678675
+        "optonly",  # times out (flaky)
+    ],
 )
diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD
index 2c5fa7af89..2e92ad6eb3 100644
--- a/tensorflow/contrib/linalg/BUILD
+++ b/tensorflow/contrib/linalg/BUILD
@@ -59,7 +59,10 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 5,
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",
+    ],
 )
 
 cuda_py_test(
@@ -78,5 +81,8 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 8,
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",
+    ],
 )
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index f616207d46..e3928a82a2 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -28,7 +28,7 @@ py_library(
 
 tf_py_test(
     name = "lookup_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["lookup_ops_test.py"],
     additional_deps = [
         ":lookup_py",
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 86e5f4a437..85cfce346c 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -203,4 +203,5 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
     ],
+    tags = ["optonly"],
 )
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 01bda8e09b..21f929894c 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2759,7 +2759,6 @@ tf_cc_tests(
         "lib/monitoring/sampler_test.cc",
         "lib/random/distribution_sampler_test.cc",
         "lib/random/philox_random_test.cc",
-        "lib/random/random_distributions_test.cc",
         "lib/random/random_test.cc",
         "lib/random/simple_philox_test.cc",
         "lib/strings/base64_test.cc",
@@ -2789,6 +2788,21 @@ tf_cc_tests(
     ],
 )
 
+tf_cc_test(
+    name = "lib_random_random_distributions_test",
+    srcs = ["lib/random/random_distributions_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+        ":lib_test_internal",
+        ":protos_all_cc",
+        ":test",
+        ":test_main",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_cc_test(
     name = "platform_env_test",
     size = "small",
diff --git a/tensorflow/examples/tutorials/mnist/BUILD b/tensorflow/examples/tutorials/mnist/BUILD
index aa1b2ec2db..d7bc6a5a7d 100644
--- a/tensorflow/examples/tutorials/mnist/BUILD
+++ b/tensorflow/examples/tutorials/mnist/BUILD
@@ -51,6 +51,7 @@ py_binary(
         "fully_connected_feed.py",
     ],
     srcs_version = "PY2AND3",
+    tags = ["optonly"],
     deps = [
         ":input_data",
         ":mnist",
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 569d3eb2ce..c2bedab4f9 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2802,7 +2802,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "image_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["ops/image_ops_test.py"],
     additional_deps = [
         ":array_ops",
@@ -4333,7 +4333,7 @@ py_test(
 
 tf_py_test(
     name = "input_test",
-    size = "small",
+    size = "medium",
     srcs = ["training/input_test.py"],
     additional_deps = [
         ":array_ops",
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 7bf4447491..c6bb9b9be7 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -205,6 +205,7 @@ py_test(
         "no_pip",
         "noasan",  # test flakily times out in asan mode.
         "notsan",  # b/67510291
+        "optonly",  # flakily times out in fastbuild
     ],
     deps = [
         ":baseline",
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 024a8cd3d1..ca7686b1d1 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -835,7 +835,7 @@ py_test(
 
 py_test(
     name = "saving_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/engine/saving_test.py"],
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 11adb1ccfc..a02783e7e7 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -592,7 +592,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "matrix_solve_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["matrix_solve_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1603,7 +1603,10 @@ cuda_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index 9555e51099..4e3f24890b 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -123,6 +123,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 5,
+    tags = ["optonly"],
 )
 
 cuda_py_test(
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 0c1fd0cf9d..58b5ef8345 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -103,10 +103,11 @@ py_test(
     data = ["//tensorflow:docs_src"],
     srcs_version = "PY2AND3",
     tags = [
-        # No reason to run sanitizers for this test.
+        # No reason to run sanitizers or fastbuild for this test.
         "noasan",
         "nomsan",
         "notsan",
+        "optonly",
     ],
     deps = [
         ":generate_lib",
-- 
GitLab


From f17311fa8d2df24e56deaab743cdf1ec5e12c692 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 18 Apr 2018 13:47:17 -0700
Subject: [PATCH 089/450] Use the new gather HLO in the bridge when lowering TF
 gather ops; NFC

After gather expansion this should boil down to a while loop very similar to
what we emit from the bridge today.

PiperOrigin-RevId: 193410095
---
 .../compiler/tf2xla/kernels/gather_op.cc      | 189 ++++++++----------
 .../tf2xla/kernels/gather_op_helpers.h        |   4 +-
 2 files changed, 90 insertions(+), 103 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 7945c05af4..0b79cb0916 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -29,52 +29,54 @@ namespace tensorflow {
 Status XlaGather(const xla::ComputationDataHandle& input,
                  const TensorShape& input_shape,
                  const xla::ComputationDataHandle& indices,
-                 TensorShape indices_shape, int64 axis, bool indices_are_nd,
-                 DataType dtype, DataType index_type,
+                 const TensorShape& indices_shape, int64 axis,
+                 bool indices_are_nd, DataType dtype, DataType index_type,
                  xla::ComputationBuilder* builder,
                  xla::ComputationDataHandle* gather_output) {
+  // There is no deep reason why we need this precondition, but this is the only
+  // combination that is used and tested today.
+  CHECK(!indices_are_nd || axis == 0);
+
+  // num_index_dims is the number of components in each index in the indices
+  // tensor.
+  //
+  // num_indices is the total number of (n dimensional or scalar) indices in the
+  // indices tensor.
+  //
   // If the indices are N-dimensional, then the minor dimension of indices
   // should be of size N and correspond to the N indices.
-  int64 num_index_dims = 1;
+  int64 num_index_dims;
+  int64 num_indices = 1;
   if (indices_are_nd) {
     CHECK_GE(indices_shape.dims(), 1);
     num_index_dims = indices_shape.dim_size(indices_shape.dims() - 1);
-    indices_shape.RemoveLastDims(1);
+    for (int64 i = 0, e = indices_shape.dims() - 1; i < e; i++) {
+      num_indices *= indices_shape.dim_size(i);
+    }
+  } else {
+    num_index_dims = 1;
+    for (int64 i = 0, e = indices_shape.dims(); i < e; i++) {
+      num_indices *= indices_shape.dim_size(i);
+    }
   }
 
-  // Although the indices Tensor is flattened into rank 1 during the lookup,
-  // and each scalar entry is used as an index into the first dimension of the
-  // input, the output is returned with shape:
-  // input.shape[:axis] + indices.shape + input.shape[axis+1:]
-
-  const int64 num_indices = indices_shape.num_elements();
-  TensorShape input_shape_pre_axis(input_shape);
-  input_shape_pre_axis.RemoveDimRange(axis, input_shape.dims());
-  TensorShape input_shape_post_axis(input_shape);
-  input_shape_post_axis.RemoveDimRange(0, axis + num_index_dims);
-  // Each slice of the input tensor has shape:
-  // [<input_shape_pre_axis>, 1, ..., 1, <input shape_post_axis>]
-  TensorShape slice_shape(input_shape);
-  for (int64 i = 0; i < num_index_dims; ++i) {
-    slice_shape.set_dim(axis + i, 1);
-  }
+  // Degenerate case: empty indices.
+  if (num_indices == 0) {
+    TensorShape input_shape_pre_axis{input_shape};
+    input_shape_pre_axis.RemoveDimRange(axis, input_shape.dims());
+    TensorShape input_shape_post_axis{input_shape};
+    input_shape_post_axis.RemoveDimRange(0, axis + num_index_dims);
 
-  TensorShape loop_out_shape;
-  loop_out_shape.AppendShape(input_shape_pre_axis);
-  loop_out_shape.AddDim(num_indices);
-  loop_out_shape.AppendShape(input_shape_post_axis);
-  TensorShape loop_out_slice_shape;
-  loop_out_slice_shape.AppendShape(input_shape_pre_axis);
-  loop_out_slice_shape.AddDim(1);
-  loop_out_slice_shape.AppendShape(input_shape_post_axis);
+    TensorShape indices_shape_no_index_vectors{indices_shape};
+    if (indices_are_nd) {
+      indices_shape_no_index_vectors.RemoveLastDims(1);
+    }
 
-  TensorShape out_shape;
-  out_shape.AppendShape(input_shape_pre_axis);
-  out_shape.AppendShape(indices_shape);
-  out_shape.AppendShape(input_shape_post_axis);
+    TensorShape out_shape;
+    out_shape.AppendShape(input_shape_pre_axis);
+    out_shape.AppendShape(indices_shape_no_index_vectors);
+    out_shape.AppendShape(input_shape_post_axis);
 
-  // Degenerate case: empty indices.
-  if (num_indices == 0) {
     *gather_output = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
                                         out_shape.dim_sizes());
     return Status::OK();
@@ -88,76 +90,61 @@ Status XlaGather(const xla::ComputationDataHandle& input,
     }
   }
 
-  // Flatten the major dimensions of indices into a single dimension for ease of
-  // iteration. If there is an axis dimension, we must leave it alone.
-  std::vector<int64> flat_indices_shape = {num_indices};
-  if (indices_are_nd) {
-    flat_indices_shape.push_back(num_index_dims);
-  }
-
-  // Specify the shape of the loop-carried Tensor tuple.
-
-  // Construct the initial values of the loop-carried Tensors.
-  auto flat_indices = builder->Reshape(indices, flat_indices_shape);
-  auto init_out = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
-                                     loop_out_shape.dim_sizes());
-  auto init = {input, flat_indices, init_out};
-
-  // Construct the while loop body's function. The implementation of gather is:
-  // for i in range(num_indices):
-  //   index = dynamic-slice(indices, i)
-  //   xi = dynamic-slice(input, index)
-  //   output = dynamic-update-slice(output, xi, i)
-  auto body_fn = [&](xla::ComputationDataHandle i,
-                     gtl::ArraySlice<xla::ComputationDataHandle> loop_vars,
-                     xla::ComputationBuilder* bodyb) {
-    auto input = loop_vars[0];
-    auto indices = loop_vars[1];
-    auto output = loop_vars[2];
-
-    auto zero_index = XlaHelpers::Zero(bodyb, index_type);
-
-    // Slice the i-th index from the indices array.
-    xla::ComputationDataHandle index;
-    auto indices_offset = bodyb->Reshape(i, {1});
-    if (indices_are_nd) {
-      // Slice out the entire nd index, if applicable.
-      indices_offset = bodyb->Pad(indices_offset, zero_index,
-                                  xla::MakeEdgePaddingConfig({{0, 1}}));
-      index = bodyb->DynamicSlice(indices, indices_offset, {1, num_index_dims});
-      index = bodyb->Collapse(index, {0, 1});
+  // Example of a 1-D gather with axis=1, pulling two [3,1] tensors out of a
+  // tensor of shape [3,3].
+  //
+  //  operand = s32[3,3] parameter(0)
+  //  indices = s32[2] parameter(1)
+  //  gather = s32[3,2] gather(operand, indices),
+  //       output_window_dims={0},
+  //       elided_window_dims={1},
+  //       gather_dims_to_operand_dims={1},
+  //       index_vector_dim=1,
+  //       window_bounds={3, 1}
+  //
+  //
+  // Example of an N-D gather pulling out slices of shape [1,1,2] out of a
+  // tensor of shape [3,3,2].
+  //
+  //  operand = s32[3,3,2] parameter(0)
+  //  indices = s32[2,2] parameter(1)
+  //  gather = s32[2,2] gather(operand, indices),
+  //       output_window_dims={1},
+  //       elided_window_dims={0,1},
+  //       gather_dims_to_operand_dims={0,1},
+  //       index_vector_dim=1,
+  //       window_bounds={1,1,2}
+
+  xla::GatherDimensionNumbers dim_numbers;
+  std::vector<int64> window_bounds;
+  window_bounds.reserve(input_shape.dims());
+  for (int64 i = 0; i < input_shape.dims(); i++) {
+    int64 window_bound;
+    if (axis <= i && i < (axis + num_index_dims)) {
+      dim_numbers.add_elided_window_dims(i);
+      window_bound = 1;
     } else {
-      index = bodyb->DynamicSlice(indices, indices_offset, {1});
+      window_bound = input_shape.dim_size(i);
+    }
+
+    window_bounds.push_back(window_bound);
+
+    if (i < axis) {
+      dim_numbers.add_output_window_dims(i);
+    } else if (i >= (axis + num_index_dims)) {
+      int64 indices_rank =
+          indices_are_nd ? (indices_shape.dims() - 1) : indices_shape.dims();
+      dim_numbers.add_output_window_dims(i + indices_rank - num_index_dims);
     }
+  }
+
+  dim_numbers.set_index_vector_dim(indices_are_nd ? (indices_shape.dims() - 1)
+                                                  : indices_shape.dims());
+  for (int64 i = axis; i < axis + num_index_dims; i++) {
+    dim_numbers.add_gather_dims_to_operand_dims(i);
+  }
 
-    // Slice the corresponding data from the input array.
-    auto start_indices = bodyb->Pad(
-        index, zero_index,
-        xla::MakeEdgePaddingConfig(
-            {{input_shape_pre_axis.dims(), input_shape_post_axis.dims()}}));
-    auto slice_i = bodyb->Reshape(
-        bodyb->DynamicSlice(input, start_indices, slice_shape.dim_sizes()),
-        loop_out_slice_shape.dim_sizes());
-
-    // Construct the index into the output Tensor 0, ..., <index>, 0, ...
-    std::vector<xla::ComputationDataHandle> out_index_vals(
-        loop_out_shape.dims(), bodyb->Reshape(zero_index, {1}));
-    out_index_vals[input_shape_pre_axis.dims()] = bodyb->Reshape(i, {1});
-    auto out_index = bodyb->ConcatInDim(out_index_vals, 0);
-
-    // Update the output Tensor
-    auto updated_output = bodyb->DynamicUpdateSlice(output, slice_i, out_index);
-
-    return std::vector<xla::ComputationDataHandle>{input, indices,
-                                                   updated_output};
-  };
-
-  // Construct the While loop, extract and reshape the output.
-  xla::PrimitiveType ptype;
-  TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(index_type, &ptype));
-  TF_ASSIGN_OR_RETURN(auto outputs, XlaForEachIndex(num_indices, ptype, body_fn,
-                                                    init, "gather", builder));
-  *gather_output = builder->Reshape(outputs[2], out_shape.dim_sizes());
+  *gather_output = builder->Gather(input, indices, dim_numbers, window_bounds);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
index bd8b92c22d..f9376f0eab 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
@@ -36,8 +36,8 @@ namespace tensorflow {
 Status XlaGather(const xla::ComputationDataHandle& input,
                  const TensorShape& input_shape,
                  const xla::ComputationDataHandle& indices,
-                 TensorShape indices_shape, int64 axis, bool indices_are_nd,
-                 DataType dtype, DataType index_type,
+                 const TensorShape& indices_shape, int64 axis,
+                 bool indices_are_nd, DataType dtype, DataType index_type,
                  xla::ComputationBuilder* builder,
                  xla::ComputationDataHandle* gather_output);
 
-- 
GitLab


From 603aad77e69ea856b39566769361c022b6af933a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 13:54:43 -0700
Subject: [PATCH 090/450] [XLA] Redesign: add arguments, result, and
 execution_platform to HloProto, because the SessionModule has those.

PiperOrigin-RevId: 193411310
---
 tensorflow/compiler/xla/service/hlo.proto | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 8fd7f8945c..0c3eb7dcb4 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -296,3 +296,20 @@ message HloProto {
   HloOrderingProto hlo_ordering = 2;
   BufferAssignmentProto buffer_assignment = 3;
 }
+
+// Encapsulates HloProto together with the arguments, result, and
+// execution_platform. This message is used for purposes such as
+// analysis/replay/file-storage.
+message HloSession {
+  // The hlo graph.
+  HloProto hlo = 1;
+
+  // The arguments passed to the graph.
+  repeated LiteralProto arguments = 2;
+
+  // The result of the graph.
+  LiteralProto result = 3;
+
+  // The name of the platform used to run the graph.
+  string execution_platform = 4;
+}
-- 
GitLab


From f3d2fdf088ea6674f0c0b034af04b99fc1a830dc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 13:54:52 -0700
Subject: [PATCH 091/450] Replace six.get_unbound_function with a simpler
 version that doesn't crash for methods of tf.keras.Model.

PiperOrigin-RevId: 193411332
---
 tensorflow/contrib/autograph/pyct/inspect_utils.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
index 63361cc4f2..a0f56a6c1f 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -63,14 +63,23 @@ def getnamespace(f):
   return namespace
 
 
+def _get_unbound_function(m):
+  # TODO(mdan): Figure out why six.get_unbound_function fails in some cases.
+  # The failure case is for tf.keras.Model.
+  if hasattr(m, 'im_func'):
+    return m.im_func
+  return m
+
+
 def getdefiningclass(m, owner_class):
   """Resolves the class (e.g. one of the superclasses) that defined a method."""
-  m = six.get_unbound_function(m)
+  # Normalize bound functions to their respective unbound versions.
+  m = _get_unbound_function(m)
   last_defining = owner_class
   for superclass in tf_inspect.getmro(owner_class):
     if hasattr(superclass, m.__name__):
       superclass_m = getattr(superclass, m.__name__)
-      if six.get_unbound_function(superclass_m) == m:
+      if _get_unbound_function(superclass_m) == m:
         last_defining = superclass
   return last_defining
 
-- 
GitLab


From 18fd1275a0c0e39a5cecea950a1fef3d8472e911 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 14:07:10 -0700
Subject: [PATCH 092/450] If the summary is empty, return empty list for
 quantiles.

PiperOrigin-RevId: 193413363
---
 .../lib/quantiles/weighted_quantiles_stream_test.cc | 13 +++++++++++++
 .../lib/quantiles/weighted_quantiles_summary.h      |  9 ++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc
index 4481c0d0e4..67ac9bf387 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc
@@ -138,6 +138,12 @@ void GenerateOneValue(int32 worker_id, int64 max_elements, double *total_weight,
   stream->Finalize();
 }
 
+void GenerateOneZeroWeightedValue(int32 worker_id, int64 max_elements,
+                                  double *total_weight, Stream *stream) {
+  stream->PushEntry(10, 0);
+  stream->Finalize();
+}
+
 TEST(WeightedQuantilesStreamTest, OneValue) {
   const double eps = 0.01;
   const int64 max_elements = 1 << 16;
@@ -145,6 +151,13 @@ TEST(WeightedQuantilesStreamTest, OneValue) {
                           {10.0, 10.0, 10.0, 10.0, 10.0}, 1e-2);
 }
 
+TEST(WeightedQuantilesStreamTest, OneZeroWeightValue) {
+  const double eps = 0.01;
+  const int64 max_elements = 1 << 16;
+  TestSingleWorkerStreams(eps, max_elements, GenerateOneZeroWeightedValue, {},
+                          1e-2);
+}
+
 TEST(WeightedQuantilesStreamTest, FixedUniform) {
   const double eps = 0.01;
   const int64 max_elements = 1 << 16;
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
index aec232f3cb..7576856dc3 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
@@ -235,6 +235,11 @@ class WeightedQuantilesSummary {
   // The resulting boundaries are guaranteed to both contain at least
   // num_boundaries unique elements and maintain approximation bounds.
   std::vector<ValueType> GenerateBoundaries(int64 num_boundaries) const {
+    std::vector<ValueType> output;
+    if (entries_.empty()) {
+      return output;
+    }
+
     // Generate soft compressed summary.
     WeightedQuantilesSummary<ValueType, WeightType, CompareFn>
         compressed_summary;
@@ -246,7 +251,6 @@ class WeightedQuantilesSummary {
     compressed_summary.Compress(num_boundaries, compression_eps);
 
     // Return boundaries.
-    std::vector<ValueType> output;
     output.reserve(compressed_summary.entries_.size());
     for (const auto& entry : compressed_summary.entries_) {
       output.push_back(entry.value);
@@ -260,6 +264,9 @@ class WeightedQuantilesSummary {
   // full rank queries O(nlogn).
   std::vector<ValueType> GenerateQuantiles(int64 num_quantiles) const {
     std::vector<ValueType> output;
+    if (entries_.empty()) {
+      return output;
+    }
     num_quantiles = std::max(num_quantiles, 2LL);
     output.reserve(num_quantiles + 1);
 
-- 
GitLab


From d61b579f10d2a56b0f8616aa1fe18e7827e3afec Mon Sep 17 00:00:00 2001
From: Chris Kennelly <ckennelly@google.com>
Date: Wed, 18 Apr 2018 14:07:23 -0700
Subject: [PATCH 093/450] Automated g4 rollback of changelist 193392688

PiperOrigin-RevId: 193413401
---
 tensorflow/core/framework/allocator.cc | 27 --------------------------
 tensorflow/core/framework/allocator.h  | 11 +----------
 2 files changed, 1 insertion(+), 37 deletions(-)

diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 29b67ebdfa..1a7e5219cd 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -48,10 +48,6 @@ constexpr size_t Allocator::kAllocatorAlignment;
 
 Allocator::~Allocator() {}
 
-void Allocator::DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) {
-  DeallocateRaw(ptr);
-}
-
 void RunResourceCtor(ResourceHandle* p, size_t n) {
   for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle();
 }
@@ -107,12 +103,7 @@ class CPUAllocator : public Allocator {
                    << "% of system memory.";
     }
 
-#ifdef __cpp_aligned_new
-    void* p =
-        ::operator new(num_bytes, static_cast<std::align_val_t>(alignment));
-#else
     void* p = port::AlignedMalloc(num_bytes, alignment);
-#endif
     if (cpu_allocator_collect_stats) {
       const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
       mutex_lock l(mu_);
@@ -141,25 +132,7 @@ class CPUAllocator : public Allocator {
       mutex_lock l(mu_);
       stats_.bytes_in_use -= alloc_size;
     }
-#ifdef __cpp_aligned_new
-    ::operator delete(ptr);
-#else
     port::AlignedFree(ptr);
-#endif
-  }
-
-  void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) override {
-#ifdef __cpp_aligned_new
-    if (cpu_allocator_collect_stats) {
-      const std::size_t alloc_size =
-          port::MallocExtension_GetAllocatedSize(ptr);
-      mutex_lock l(mu_);
-      stats_.bytes_in_use -= alloc_size;
-    }
-    ::operator delete(ptr, num_bytes, static_cast<std::align_val_t>(alignment));
-#else
-    DeallocateRaw(ptr);
-#endif
   }
 
   void GetStats(AllocatorStats* stats) override {
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 0dda38fbb7..2c87156dca 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -101,11 +101,6 @@ class Allocator {
   // REQUIRES: "ptr" was previously returned by a call to AllocateRaw
   virtual void DeallocateRaw(void* ptr) = 0;
 
-  // Deallocate a block of memory pointer to by "ptr" with size "num_bytes"
-  // REQUIRES: "ptr" was previously returned by a call to AllocateRaw with
-  // "num_bytes" and "alignment"
-  virtual void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes);
-
   // Convenience functions to do typed allocation.  C++ constructors
   // and destructors are invoked for complex types if necessary,
   // depending on the concrete Allocator implementation. May return
@@ -137,7 +132,7 @@ class Allocator {
   void Deallocate(T* ptr, size_t num_elements) {
     if (ptr) {
       RunDtor<T>(ptr, num_elements);
-      DeallocateRaw(ptr, kAllocatorAlignment, sizeof(T) * num_elements);
+      DeallocateRaw(ptr);
     }
   }
 
@@ -309,10 +304,6 @@ class AllocatorWrapper : public Allocator {
 
   void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); }
 
-  void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) override {
-    wrapped_->DeallocateRaw(ptr, alignment, num_bytes);
-  }
-
   bool TracksAllocationSizes() override {
     return wrapped_->TracksAllocationSizes();
   }
-- 
GitLab


From 497dc60720669434a9e6cf7ff19be9ca6d526010 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 14:30:01 -0700
Subject: [PATCH 094/450] Allow turning off checkpointing for
 ShardedMutableDenseHashTable. Keep the checkpointing=True as the default.

PiperOrigin-RevId: 193417350
---
 .../python/ops/sharded_mutable_dense_hashtable.py               | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
index ec726bbed4..5015fb0848 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
@@ -49,6 +49,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
                default_value,
                empty_key,
                num_shards=1,
+               checkpoint=True,
                name='ShardedMutableHashTable'):
     with ops.name_scope(name, 'sharded_mutable_hash_table') as scope:
       super(ShardedMutableDenseHashTable, self).__init__(key_dtype,
@@ -61,6 +62,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
                 value_dtype=value_dtype,
                 default_value=default_value,
                 empty_key=empty_key,
+                checkpoint=checkpoint,
                 name='%s-%d-of-%d' % (name, i + 1, num_shards)))
       self._table_shards = table_shards
       # TODO(andreasst): add a value_shape() method to LookupInterface
-- 
GitLab


From b75e1204d3aaab20d7a937edd6b2f05ff5785827 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 14:34:50 -0700
Subject: [PATCH 095/450] Increase shard count of :init_ops_test.

PiperOrigin-RevId: 193418147
---
 tensorflow/python/kernel_tests/BUILD | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index a02783e7e7..3aedd70f8c 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1603,10 +1603,8 @@ cuda_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
-    tags = [
-        "noasan",
-        "optonly",
-    ],
+    shard_count = 4,
+    tags = ["noasan"],
 )
 
 cuda_py_test(
-- 
GitLab


From 325ba9ece698d04082b173ba300a10623d27de96 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 14:38:07 -0700
Subject: [PATCH 096/450] Adds an implementation of the precision at recall
 metric.

PiperOrigin-RevId: 193418737
---
 tensorflow/contrib/metrics/__init__.py        |   2 +
 .../contrib/metrics/python/ops/metric_ops.py  | 115 +++++++++++++++
 .../metrics/python/ops/metric_ops_test.py     | 132 ++++++++++++++++++
 3 files changed, 249 insertions(+)

diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py
index de02dc8f45..5effea3596 100644
--- a/tensorflow/contrib/metrics/__init__.py
+++ b/tensorflow/contrib/metrics/__init__.py
@@ -71,6 +71,7 @@ See the @{$python/contrib.metrics} guide.
 @@count
 @@precision_recall_at_equal_thresholds
 @@recall_at_precision
+@@precision_at_recall
 
 """
 from __future__ import absolute_import
@@ -87,6 +88,7 @@ from tensorflow.contrib.metrics.python.ops.metric_ops import aggregate_metrics
 from tensorflow.contrib.metrics.python.ops.metric_ops import auc_with_confidence_intervals
 from tensorflow.contrib.metrics.python.ops.metric_ops import cohen_kappa
 from tensorflow.contrib.metrics.python.ops.metric_ops import count
+from tensorflow.contrib.metrics.python.ops.metric_ops import precision_at_recall
 from tensorflow.contrib.metrics.python.ops.metric_ops import precision_recall_at_equal_thresholds
 from tensorflow.contrib.metrics.python.ops.metric_ops import recall_at_precision
 from tensorflow.contrib.metrics.python.ops.metric_ops import sparse_recall_at_top_k
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 9c8ae48094..5364e3075d 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -2588,6 +2588,121 @@ def recall_at_precision(labels,
     return recall, update_op
 
 
+def precision_at_recall(labels,
+                        predictions,
+                        target_recall,
+                        weights=None,
+                        num_thresholds=200,
+                        metrics_collections=None,
+                        updates_collections=None,
+                        name=None):
+  """Computes the precision at a given recall.
+
+  This function creates variables to track the true positives, false positives,
+  true negatives, and false negatives at a set of thresholds. Among those
+  thresholds where recall is at least `target_recall`, precision is computed
+  at the threshold where recall is closest to `target_recall`.
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` operation that updates these variables and returns the
+  precision at `target_recall`. `update_op` increments the counts of true
+  positives, false positives, true negatives, and false negatives with the
+  weight of each case found in the `predictions` and `labels`.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  For additional information about precision and recall, see
+  http://en.wikipedia.org/wiki/Precision_and_recall
+
+  Args:
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
+    predictions: A floating point `Tensor` of arbitrary shape and whose values
+      are in the range `[0, 1]`.
+    target_recall: A scalar value in range `[0, 1]`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `labels` dimension).
+    num_thresholds: The number of thresholds to use for matching the given
+      recall.
+    metrics_collections: An optional list of collections to which `precision`
+      should be added.
+    updates_collections: An optional list of collections to which `update_op`
+      should be added.
+    name: An optional variable_scope name.
+
+  Returns:
+    precision: A scalar `Tensor` representing the precision at the given
+      `target_recall` value.
+    update_op: An operation that increments the variables for tracking the
+      true positives, false positives, true negatives, and false negatives and
+      whose value matches `precision`.
+
+  Raises:
+    ValueError: If `predictions` and `labels` have mismatched shapes, if
+      `weights` is not `None` and its shape doesn't match `predictions`, or if
+      `target_recall` is not between 0 and 1, or if either `metrics_collections`
+      or `updates_collections` are not a list or tuple.
+    RuntimeError: If eager execution is enabled.
+  """
+  if context.executing_eagerly():
+    raise RuntimeError('tf.metrics.precision_at_recall is not '
+                       'supported when eager execution is enabled.')
+
+  if target_recall < 0 or target_recall > 1:
+    raise ValueError('`target_recall` must be in the range [0, 1].')
+
+  with variable_scope.variable_scope(name, 'precision_at_recall',
+                                     (predictions, labels, weights)):
+    kepsilon = 1e-7  # Used to avoid division by zero.
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
+    thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
+
+    values, update_ops = _streaming_confusion_matrix_at_thresholds(
+        predictions, labels, thresholds, weights)
+
+    def compute_precision_at_recall(tp, fp, fn, name):
+      """Computes the precision at a given recall.
+
+      Args:
+        tp: True positives.
+        fp: False positives.
+        fn: False negatives.
+        name: A name for the operation.
+
+      Returns:
+        The precision at the desired recall.
+      """
+      recalls = math_ops.div(tp, tp + fn + kepsilon)
+
+      # Because recall is monotone decreasing as a function of the threshold,
+      # the smallest recall exceeding target_recall occurs at the largest
+      # threshold where recall >= target_recall.
+      admissible_recalls = math_ops.cast(
+          math_ops.greater_equal(recalls, target_recall), dtypes.int64)
+      tf_index = math_ops.reduce_sum(admissible_recalls) - 1
+
+      # Now we have the threshold at which to compute precision:
+      return math_ops.div(tp[tf_index] + kepsilon,
+                          tp[tf_index] + fp[tf_index] + kepsilon,
+                          name)
+
+    precision_value = compute_precision_at_recall(
+        values['tp'], values['fp'], values['fn'], 'value')
+    update_op = compute_precision_at_recall(
+        update_ops['tp'], update_ops['fp'], update_ops['fn'], 'update_op')
+
+    if metrics_collections:
+      ops.add_to_collections(metrics_collections, precision_value)
+
+    if updates_collections:
+      ops.add_to_collections(updates_collections, update_op)
+
+    return precision_value, update_op
+
+
 def streaming_sparse_average_precision_at_k(predictions,
                                             labels,
                                             k,
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 33eb655fb6..76420db8bd 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -3380,6 +3380,138 @@ class RecallAtPrecisionTest(test.TestCase):
       self.assertAlmostEqual(target_recall, recall.eval())
 
 
+class PrecisionAtRecallTest(test.TestCase):
+
+  def setUp(self):
+    np.random.seed(1)
+    ops.reset_default_graph()
+
+  def testVars(self):
+    metrics.precision_at_recall(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        target_recall=0.7)
+    _assert_metric_variables(self,
+                             ('precision_at_recall/true_positives:0',
+                              'precision_at_recall/false_negatives:0',
+                              'precision_at_recall/false_positives:0',
+                              'precision_at_recall/true_negatives:0'))
+
+  def testMetricsCollection(self):
+    my_collection_name = '__metrics__'
+    mean, _ = metrics.precision_at_recall(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        target_recall=0.7,
+        metrics_collections=[my_collection_name])
+    self.assertListEqual(ops.get_collection(my_collection_name), [mean])
+
+  def testUpdatesCollection(self):
+    my_collection_name = '__updates__'
+    _, update_op = metrics.precision_at_recall(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        target_recall=0.7,
+        updates_collections=[my_collection_name])
+    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
+
+  def testValueTensorIsIdempotent(self):
+    predictions = random_ops.random_uniform(
+        (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
+    labels = random_ops.random_uniform(
+        (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=1)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.7)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+
+      # Run several updates.
+      for _ in range(10):
+        sess.run(update_op)
+
+      # Then verify idempotency.
+      initial_precision = precision.eval()
+      for _ in range(10):
+        self.assertAlmostEqual(initial_precision, precision.eval(), places=5)
+
+  def testAllCorrect(self):
+    inputs = np.random.randint(0, 2, size=(100, 1))
+
+    predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(inputs)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.7)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertEqual(1, sess.run(update_op))
+      self.assertEqual(1, precision.eval())
+
+  def testAllIncorrect(self):
+    inputs = np.random.randint(0, 2, size=(100, 1))
+
+    predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
+    labels = 1.0 - predictions
+    label_prior = math_ops.reduce_mean(labels)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.2)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertEqual(sess.run(label_prior), sess.run(update_op))
+      self.assertEqual(sess.run(label_prior), precision.eval())
+
+  def testSomeCorrectHighRecall(self):
+    predictions_values = [0.1, 0.2, 0.5, 0.3, 0.0, 0.1, 0.45, 0.5, 0.8, 0.9]
+    labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    predictions = constant_op.constant(
+        predictions_values, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(labels_values)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.8)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.8, sess.run(update_op))
+      self.assertAlmostEqual(0.8, precision.eval())
+
+  def testSomeCorrectLowRecall(self):
+    predictions_values = [0.1, 0.2, 0.7, 0.3, 0.0, 0.1, 0.45, 0.5, 0.6, 0.9]
+    labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    predictions = constant_op.constant(
+        predictions_values, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(labels_values)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.4)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(2.0/3, sess.run(update_op))
+      self.assertAlmostEqual(2.0/3, precision.eval())
+
+  def testWeighted_multipleLabelDtypes(self):
+    for label_dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions_values = [
+          0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.22, 0.25, 0.31, 0.35]
+      labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+      weights_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+      predictions = constant_op.constant(
+          predictions_values, dtype=dtypes_lib.float32)
+      labels = math_ops.cast(labels_values, dtype=label_dtype)
+      weights = constant_op.constant(weights_values)
+      precision, update_op = metrics.precision_at_recall(
+          labels, predictions, target_recall=0.8, weights=weights)
+
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertAlmostEqual(34.0/43, sess.run(update_op))
+        self.assertAlmostEqual(34.0/43, precision.eval())
+
+
 class StreamingFNRThresholdsTest(test.TestCase):
 
   def setUp(self):
-- 
GitLab


From d964834a922e77198fd387aac6c6cc5970a31e7d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 15:02:26 -0700
Subject: [PATCH 097/450] Merged commit includes the following changes:
 193422827  by yifeif:

    Fix buildifier error.

--
193421691  by skyewm:

    Make GraphModeFunctions work with _USE_C_SHAPES=True.

    Tensor._handle_data is going away. This change adds special hooks for
    propagating the resource handle shape information through
    EagerTensors.

--
193421473  by A. Unique TensorFlower:

    Register dynamic_stitch for DT_VARIANT type.

--
193421175  by nolivia:

    disabling flaky tsan test

--
193420117  by nolivia:

    disabling flaky test in tensorflow that has no apparent culprit

--

PiperOrigin-RevId: 193422827
---
 tensorflow/c/eager/BUILD                      |  2 +
 tensorflow/c/eager/c_api.cc                   | 57 +++++++++++++++++++
 tensorflow/c/eager/c_api.h                    | 14 +++++
 .../contrib/rpc/python/kernel_tests/BUILD     |  1 +
 tensorflow/core/kernels/dynamic_stitch_op.cc  |  1 +
 tensorflow/python/eager/function.py           | 18 +++++-
 tensorflow/python/eager/function_test.py      |  3 +
 tensorflow/python/framework/test_util.py      | 24 ++++++++
 tensorflow/python/kernel_tests/BUILD          |  5 +-
 .../python/ops/resource_variable_ops.py       | 24 +++++++-
 tensorflow/python/pywrap_tfe.i                |  2 +
 11 files changed, 145 insertions(+), 6 deletions(-)

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index a2d96357ac..3e14c10727 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -41,6 +41,8 @@ tf_cuda_library(
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
+            # TODO(b/74620627): move this here
+            "//tensorflow/python:cpp_shape_inference_proto_cc",
         ],
     }) + select({
         "//tensorflow:with_xla_support": [
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 393851d13c..369342b142 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/python/framework/cpp_shape_inference.pb.h"
 
 using tensorflow::int64;
 using tensorflow::string;
@@ -1015,6 +1016,62 @@ void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
   ctx->context.RunMetadataProto()->Clear();
 }
 
+void TFE_GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
+                                       TF_Buffer* output_proto,
+                                       TF_Status* status) {
+  tensorflow::Node* node = &output.oper->node;
+  tensorflow::CppShapeInferenceResult::HandleData handle_data;
+  handle_data.set_is_set(true);
+  {
+    tensorflow::mutex_lock l(graph->mu);
+    tensorflow::shape_inference::InferenceContext* ic =
+        graph->refiner.GetContext(node);
+    CHECK(ic != nullptr);
+    CHECK_LT(output.index, ic->num_outputs());
+    const auto* shapes_and_types =
+        ic->output_handle_shapes_and_types(output.index);
+    if (shapes_and_types == nullptr) {
+      output_proto->data = nullptr;
+      output_proto->length = 0;
+      output_proto->data_deallocator = nullptr;
+      return;
+    }
+
+    for (const auto& p : *shapes_and_types) {
+      auto* out_shape_and_type = handle_data.add_shape_and_type();
+      ic->ShapeHandleToProto(p.shape, out_shape_and_type->mutable_shape());
+      out_shape_and_type->set_dtype(p.dtype);
+    }
+  }
+  status->status = MessageToBuffer(handle_data, output_proto);
+}
+
+void TFE_SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
+                                       const void* proto, size_t proto_len,
+                                       TF_Status* status) {
+  tensorflow::CppShapeInferenceResult::HandleData handle_data;
+  if (!handle_data.ParseFromArray(proto, proto_len)) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Couldn't deserialize HandleData proto");
+    return;
+  }
+  DCHECK(handle_data.is_set());
+
+  tensorflow::mutex_lock l(graph->mu);
+  tensorflow::shape_inference::InferenceContext* ic =
+      graph->refiner.GetContext(&output.oper->node);
+
+  std::vector<tensorflow::shape_inference::ShapeAndType> shapes_and_types;
+  for (const auto& shape_and_type_proto : handle_data.shape_and_type()) {
+    tensorflow::shape_inference::ShapeHandle shape;
+    status->status =
+        ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape);
+    if (!status->status.ok()) return;  // Abort on the first invalid shape.
+    shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype());
+  }
+  ic->set_output_handle_shapes_and_types(output.index, shapes_and_types);
+}
+
 namespace {
 TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func,
                 TF_Status* status) {
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 3926c22ce1..15ac0f376c 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -329,6 +329,20 @@ TF_CAPI_EXPORT extern void TFE_ContextExportRunMetadata(TFE_Context* ctx,
                                                         TF_Buffer* buf,
                                                         TF_Status* status);
 
+// Returns the serialized CppShapeInferenceResult::HandleData proto for
+// `output` if it's a resource tensor, or otherwise returns an empty buffer.
+TF_CAPI_EXPORT extern void TFE_GetResourceHandleShapeAndType(
+    TF_Graph* graph, TF_Output output, TF_Buffer* output_proto,
+    TF_Status* status);
+
+// Sets the resource handle shapes and types of `output` from `proto`, which
+// should be a serialized CppShapeInferenceResult::HandleData proto.
+TF_CAPI_EXPORT extern void TFE_SetResourceHandleShapeAndType(TF_Graph* graph,
+                                                             TF_Output output,
+                                                             const void* proto,
+                                                             size_t proto_len,
+                                                             TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
index 2311c15a68..f3e6731213 100644
--- a/tensorflow/contrib/rpc/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
@@ -28,6 +28,7 @@ py_library(
 py_library(
     name = "rpc_op_test_base",
     srcs = ["rpc_op_test_base.py"],
+    tags = ["notsan"],
     deps = [
         ":test_example_proto_py",
         "//tensorflow/contrib/proto",
diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc
index f018499f6c..b01db91720 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op.cc
@@ -326,6 +326,7 @@ struct ParallelDynamicStitchOpCPU : DynamicStitchOpImplCPU<T, true> {
                           ParallelDynamicStitchOpCPU<type>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH);
+TF_CALL_variant(REGISTER_DYNAMIC_STITCH);
 #undef REGISTER_DYNAMIC_STITCH
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 5168ad3b18..0f1170bb42 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -38,6 +38,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
@@ -69,9 +70,22 @@ def capture_value(tensor_map, value, dtype, name):
     captured_value = graph_placeholder(
         dtype=dtype or value.dtype, shape=value.shape, name=name)
     if captured_value.dtype == dtypes_module.resource:
-      handle_data = value._handle_data  # pylint: disable=protected-access
-      captured_value._handle_data = handle_data  # pylint: disable=protected-access
+      if ops._USE_C_SHAPES:  # pylint: disable=protected-access
+        if isinstance(value, ops.EagerTensor):
+          handle_data = value._handle_data  # pylint: disable=protected-access
+        else:
+          handle_data = resource_variable_ops.get_resource_handle_data(value)
+      else:
+        handle_data = value._handle_data  # pylint: disable=protected-access
       if handle_data is not None and handle_data.is_set:
+        # pylint: disable=protected-access
+        if ops._USE_C_SHAPES:
+          pywrap_tensorflow.TFE_SetResourceHandleShapeAndType(
+              captured_value.graph._c_graph, captured_value._as_tf_output(),
+              handle_data.SerializeToString())
+        else:
+          captured_value._handle_data = handle_data
+        # pylint: enable=protected-access
         # Ensure that shapes and dtypes are propagated.
         shapes, types = zip(*[(pair.shape, pair.dtype)
                               for pair in handle_data.shape_and_type])
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 65dde75e60..1828c987f4 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function as tf_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -41,6 +42,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.training import gradient_descent
 
 
+@test_util.with_c_shapes
 class FunctionTest(test.TestCase):
 
   def testBasic(self):
@@ -615,6 +617,7 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual([[[[4.0]]]], y.numpy())
 
 
+@test_util.with_c_shapes
 class AutomaticControlDependenciesTest(test.TestCase):
 
   def testBasic(self):
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 70e70abc06..f954b9d6c7 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -464,6 +464,30 @@ def with_c_api(cls):
   return cls
 
 
+def with_c_shapes(cls):
+  """Adds methods that call original methods but with C API shapes enabled.
+
+  Note this enables C shapes in new methods after running the test class's
+  setup method.
+
+  Args:
+    cls: class to decorate
+
+  Returns:
+    cls with new test methods added
+  """
+  # If C shapes are already enabled, don't do anything. Some tests break if the
+  # same test is run twice, so this allows us to turn on the C shapes by default
+  # without breaking these tests.
+  if ops._USE_C_SHAPES:
+    return cls
+
+  for name, value in cls.__dict__.copy().items():
+    if callable(value) and name.startswith("test"):
+      setattr(cls, name + "WithCShapes", enable_c_shapes(value))
+  return cls
+
+
 def assert_no_new_pyobjects_executing_eagerly(f):
   """Decorator for asserting that no new Python objects persist after a test.
 
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 3aedd70f8c..9440f2a4f9 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1604,7 +1604,10 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 4,
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "notap",
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 49dd7f9948..4d26b2f46e 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -24,6 +24,8 @@ from tensorflow.core.framework import variable_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
+from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -41,6 +43,19 @@ from tensorflow.python.training import checkpointable
 from tensorflow.python.util import compat
 
 
+def get_resource_handle_data(graph_op):
+  assert ops._USE_C_SHAPES  # pylint: disable=protected-access
+  assert type(graph_op) == ops.Tensor  # pylint: disable=unidiomatic-typecheck
+
+  with c_api_util.tf_buffer() as buf:
+    pywrap_tensorflow.TFE_GetResourceHandleShapeAndType(
+        graph_op.graph._c_graph, graph_op._as_tf_output(), buf)  # pylint: disable=protected-access
+    data = pywrap_tensorflow.TF_GetBuffer(buf)
+
+  return cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData.FromString(
+      compat.as_bytes(data))
+
+
 def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
   """Creates a variable handle with information to do shape inference."""
   container = ops.get_default_graph()._container  # pylint: disable=protected-access
@@ -73,9 +88,12 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
     # shape inference doesn't run in eager mode we copy this data here for when
     # the handle is captured by an eager mode function.
     # pylint: disable=protected-access
-    if h._handle_data is None:
-      ops.set_shape_and_handle_data_for_outputs(h.op)
-    handle._handle_data = h._handle_data
+    if ops._USE_C_SHAPES:
+      handle._handle_data = get_resource_handle_data(h)
+    else:
+      if h._handle_data is None:
+        ops.set_shape_and_handle_data_for_outputs(h.op)
+      handle._handle_data = h._handle_data
     # pylint: enable=protected-access
 
   # Clean up our reference cycles to avoid making the garbage collector run.
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 5ee55301df..0982a67dee 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -59,6 +59,8 @@ limitations under the License.
 %rename("%s") TFE_ContextOptionsSetAsync;
 %rename("%s") TFE_DeleteContextOptions;
 %rename("%s") TFE_Py_TensorShapeSlice;
+%rename("%s") TFE_GetResourceHandleShapeAndType;
+%rename("%s") TFE_SetResourceHandleShapeAndType;
 
 %{
 #include "tensorflow/python/eager/pywrap_tfe.h"
-- 
GitLab


From a655d1670c264652efc42c1b12565232e22b8b84 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 15:02:44 -0700
Subject: [PATCH 098/450] Fix a bug in GcsFileSystem where an inconsistent read
 error might not be reported if the requested read size is larger than the
 block size.

PiperOrigin-RevId: 193422905
---
 .../core/platform/cloud/gcs_file_system.cc    |  2 +-
 .../platform/cloud/gcs_file_system_test.cc    | 41 +++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 6ed1d5dad2..f0003fa784 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -840,7 +840,7 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
 
   throttle_.RecordResponse(bytes_read);
 
-  if (bytes_read < block_size()) {
+  if (bytes_read < n) {
     // Check stat cache to see if we encountered an interrupted read.
     FileStatistics stat;
     if (stat_cache_->Lookup(filename, &stat)) {
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index e9eca04fef..ca4b7722b6 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -360,6 +360,47 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoObjectName) {
             fs.NewRandomAccessFile("gs://bucket/", &file).code());
 }
 
+TEST(GcsFileSystemTest, NewRandomAccessFile_InconsistentRead) {
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "random_access.txt?fields=size%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"6\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-5\n"
+           "Timeouts: 5 1 20\n",
+           "012")});
+
+  // Set stat_cache_max_age to 1000s so that StatCache could work.
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   1e3 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
+
+  // Stat the file first so that the file stats are cached.
+  FileStatistics stat;
+  TF_ASSERT_OK(fs.Stat("gs://bucket/random_access.txt", &stat));
+
+  std::unique_ptr<RandomAccessFile> file;
+  TF_ASSERT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
+
+  char scratch[6];
+  StringPiece result;
+
+  EXPECT_EQ(errors::Code::INTERNAL,
+            file->Read(0, sizeof(scratch), &result, scratch).code());
+}
+
 TEST(GcsFileSystemTest, NewWritableFile) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
-- 
GitLab


From 5c1e253344c0a9d90b27eeef6dd5fcf76b74bba5 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Wed, 18 Apr 2018 15:04:21 -0700
Subject: [PATCH 099/450] Fix loss computation bug in Model training/eval
 methods with eager execution enabled. Fixes #18642.

PiperOrigin-RevId: 193423288
---
 .../_impl/keras/engine/training_eager.py      |  2 +-
 .../_impl/keras/engine/training_eager_test.py | 25 +++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
index 4cdb5f108a..695669d9ee 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -150,7 +150,7 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
       weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
       with backend.name_scope(model.output_names[i] + '_loss'):
         output_loss = weighted_masked_fn(
-            outs[i], targets[i], weights, mask=mask)
+            targets[i], outs[i], weights, mask=mask)
       loss_metrics.append(backend.mean(output_loss))
 
       loss_weight = model.loss_weights_list[i]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
index 6cdb6b0753..ed0f91ee1e 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
@@ -625,6 +626,30 @@ class LossWeightingTest(test.TestCase):
       model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
 
 
+class CorrectnessTest(test.TestCase):
+
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_loss_correctness(self):
+    # Test that training loss is the same in eager and graph
+    # (by comparing it to a reference value in a deterministic case)
+    model = keras.Sequential()
+    model.add(keras.layers.Dense(3,
+                                 activation='relu',
+                                 input_dim=4,
+                                 kernel_initializer='ones'))
+    model.add(keras.layers.Dense(2,
+                                 activation='softmax',
+                                 kernel_initializer='ones'))
+    model.compile(loss='sparse_categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+    x = np.ones((100, 4))
+    np.random.seed(123)
+    y = np.random.randint(0, 1, size=(100, 1))
+    history = model.fit(x, y, epochs=1, batch_size=10)
+    self.assertEqual(
+        np.around(history.history['loss'][-1], decimals=4), 0.6173)
+
+
 if __name__ == '__main__':
   ops.enable_eager_execution()
   test.main()
-- 
GitLab


From e662c3fcfcd03fd091b032a5a33971428f4cdb89 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 15:25:42 -0700
Subject: [PATCH 100/450] A very simple AST pattern matcher. Only supports
 wildcards, and it's minimally tested. When using, you may want to add your
 use case to the tests.

PiperOrigin-RevId: 193426859
---
 tensorflow/contrib/autograph/pyct/ast_util.py | 79 ++++++++++++++++++-
 .../contrib/autograph/pyct/ast_util_test.py   | 28 ++++++-
 2 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/ast_util.py b/tensorflow/contrib/autograph/pyct/ast_util.py
index 4a70bab440..c4f82d1170 100644
--- a/tensorflow/contrib/autograph/pyct/ast_util.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util.py
@@ -23,10 +23,11 @@ import ast
 import gast
 
 from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import parser
 
 
 class CleanCopier(gast.NodeVisitor):
-  """Copy AST nodes.
+  """Copies AST nodes.
 
   The copied nodes will ignore almost all fields that are prefixed by '__'.
   Exceptions make some annotations.
@@ -106,3 +107,79 @@ def keywords_to_dict(keywords):
     keys.append(gast.Str(kw.arg))
     values.append(kw.value)
   return gast.Dict(keys=keys, values=values)
+
+
+class PatternMatcher(gast.NodeVisitor):
+  """Matches a node against a pattern represented by a node.
+
+  The pattern may contain wildcards represented by the symbol '_'.
+  """
+
+  def __init__(self, pattern):
+    self.pattern = pattern
+    self.pattern_stack = []
+    self.matches = True
+
+  def compare_and_visit(self, node, pattern):
+    # Check types here so that list items (e.g. Compare.ops) are covered too.
+    if not isinstance(node, type(pattern)) and not isinstance(
+        pattern, type(node)):
+      return self.no_match()
+    self.pattern_stack.append(self.pattern)
+    self.pattern = pattern
+    self.generic_visit(node)
+    self.pattern = self.pattern_stack.pop()
+
+  def no_match(self):
+    self.matches = False
+    return False
+
+  def is_wildcard(self, p):
+    if isinstance(p, (list, tuple)) and len(p) == 1:
+      p, = p
+    if isinstance(p, gast.Name) and p.id == '_':
+      return True
+    return p == '_'
+
+  def generic_visit(self, node):
+    if not self.matches:
+      return
+
+    pattern = self.pattern
+    for f in node._fields:
+      if f.startswith('__'):
+        continue
+      # A field unset on the node matches only an unset/falsy pattern field.
+      if not hasattr(node, f):
+        if hasattr(pattern, f) and getattr(pattern, f):
+          return self.no_match()
+        else:
+          continue
+      if not hasattr(pattern, f):
+        return self.no_match()
+
+      v = getattr(node, f)
+      p = getattr(pattern, f)
+      # '_' (alone or as the sole list element) accepts any value here.
+      if self.is_wildcard(p):
+        continue
+      if isinstance(v, (list, tuple)):
+        if not isinstance(p, (list, tuple)) or len(v) != len(p):
+          return self.no_match()
+        for v_item, p_item in zip(v, p):
+          self.compare_and_visit(v_item, p_item)
+      elif isinstance(v, (gast.AST, ast.AST)):
+        self.compare_and_visit(v, p)
+      else:
+        # Assume everything else is a value type.
+        if v != p:
+          return self.no_match()
+
+
+def matches(node, pattern):
+  if isinstance(pattern, str):
+    pattern = parser.parse_expression(pattern)
+  matcher = PatternMatcher(pattern)
+  matcher.visit(node)
+  return matcher.matches
+
diff --git a/tensorflow/contrib/autograph/pyct/ast_util_test.py b/tensorflow/contrib/autograph/pyct/ast_util_test.py
index 8faf92c705..3afa04a506 100644
--- a/tensorflow/contrib/autograph/pyct/ast_util_test.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util_test.py
@@ -85,7 +85,33 @@ class AstUtilTest(test.TestCase):
     output.body += (ast.Assign([ast.Name(id='d', ctx=ast.Store())], d),)
     result, _ = compiler.ast_to_object(output)
     self.assertDictEqual(result.d, {'a': 3, 'c': 1, 'd': 'e'})
-    print(d)
+
+  def assertMatch(self, target_str, pattern_str):
+    node = parser.parse_expression(target_str)
+    pattern = parser.parse_expression(pattern_str)
+    self.assertTrue(ast_util.matches(node, pattern))
+
+  def assertNoMatch(self, target_str, pattern_str):
+    node = parser.parse_expression(target_str)
+    pattern = parser.parse_expression(pattern_str)
+    self.assertFalse(ast_util.matches(node, pattern))
+
+  def test_matches_symbols(self):
+    self.assertMatch('foo', '_')
+    self.assertNoMatch('foo()', '_')
+    self.assertMatch('foo + bar', 'foo + _')
+    self.assertNoMatch('bar + bar', 'foo + _')
+    self.assertNoMatch('foo - bar', 'foo + _')
+
+  def test_matches_function_args(self):
+    self.assertMatch('super(Foo, self).__init__(arg1, arg2)',
+                     'super(_).__init__(_)')
+    self.assertMatch('super().__init__()', 'super(_).__init__(_)')
+    self.assertNoMatch('super(Foo, self).bar(arg1, arg2)',
+                       'super(_).__init__(_)')
+    self.assertMatch('super(Foo, self).__init__()', 'super(Foo, _).__init__(_)')
+    self.assertNoMatch('super(Foo, self).__init__()',
+                       'super(Bar, _).__init__(_)')
 
 
 if __name__ == '__main__':
-- 
GitLab


From 80f60ea37ed77b3dbe1d983f101a5efba2fd4f2e Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 18 Apr 2018 15:27:05 -0700
Subject: [PATCH 101/450] Never use the LegacySession when a Master explicitly
 calls CreateWorkerSession.

Previously, if the session handle was unrecognized by the worker, it
would default to using the LegacySession. This prevents us from
noticing that a server has been restarted.

To address the problem in a backwards-compatible way, we add a bit to
each session-handle-carrying worker request, indicating whether the
master believes that CreateWorkerSession has been called. If this bit
is set and the handle is unrecognized, the worker will raise an
AbortedError, which can be caught by high-level frameworks such as
`tf.estimator`.

Note that CreateWorkerSession is not yet used by default, and a
follow-up change will add that.

PiperOrigin-RevId: 193427057
---
 .../cluster_function_library_runtime.cc       |  2 +
 .../cluster_function_library_runtime.h        |  7 +-
 .../cluster_function_library_runtime_test.cc  |  2 +-
 .../distributed_runtime/master_session.cc     |  3 +
 .../distributed_runtime/message_wrappers.cc   | 23 +++++++
 .../distributed_runtime/message_wrappers.h    | 10 +++
 .../core/distributed_runtime/session_mgr.cc   | 24 ++++---
 .../core/distributed_runtime/session_mgr.h    |  8 ++-
 .../distributed_runtime/session_mgr_test.cc   | 34 +++++++---
 tensorflow/core/distributed_runtime/worker.cc | 67 ++++++++++++++-----
 .../distributed_runtime/worker_session.cc     |  3 +-
 tensorflow/core/protobuf/worker.proto         | 11 ++-
 12 files changed, 153 insertions(+), 41 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
index 000a03da5d..6edc2ec5ed 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
@@ -145,6 +145,7 @@ Status ClusterFunctionLibraryRuntime::Instantiate(
 
   RegisterGraphRequest req;
   req.set_session_handle(worker_session_->session_name);
+  req.set_create_worker_session_called(create_worker_session_called_);
   *req.mutable_graph_def() = gdef;
   req.mutable_graph_options()
       ->mutable_optimizer_options()
@@ -182,6 +183,7 @@ void ClusterFunctionLibraryRuntime::Run(
 
   RunGraphRequest* req = new RunGraphRequest;
   req->set_session_handle(worker_session_->session_name);
+  req->set_create_worker_session_called(create_worker_session_called_);
   req->set_graph_handle(function_data->graph_handle);
   // Borrowed from master_session.cc
   const uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
index d3ca350e36..1ea0a3ad51 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
@@ -27,8 +27,10 @@ struct WorkerSession;
 // functions across processes by making RPCs.
 class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
  public:
-  ClusterFunctionLibraryRuntime(WorkerSession* worker_session)
-      : worker_session_(worker_session) {}
+  ClusterFunctionLibraryRuntime(WorkerSession* worker_session,
+                                bool create_worker_session_called)
+      : worker_session_(worker_session),
+        create_worker_session_called_(create_worker_session_called) {}
 
   ~ClusterFunctionLibraryRuntime() override;
 
@@ -51,6 +53,7 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
 
   mutable mutex mu_;
   WorkerSession* const worker_session_ = nullptr;  // not owned.
+  const bool create_worker_session_called_;
 
   struct FunctionData {
     const string graph_handle;
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
index 1810996ab8..6f96d7cb06 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
@@ -44,7 +44,7 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test {
         std::unique_ptr<GraphMgr>()));
 
     cluster_flr_.reset(
-        new ClusterFunctionLibraryRuntime(worker_session_.get()));
+        new ClusterFunctionLibraryRuntime(worker_session_.get(), true));
   }
 
   Status ConstructFunctionGraphHelper(
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index e0a5bb4c53..08020f0266 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -431,6 +431,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     const Part& part = partitions_[i];
     Call* c = &calls[i];
     c->req.set_session_handle(session_handle_);
+    c->req.set_create_worker_session_called(!should_deregister_);
     c->req.mutable_graph_def()->Swap(&graph_partitions[part.name]);
     *c->req.mutable_graph_options() = session_opts_.config.graph_options();
     *c->req.mutable_debug_options() =
@@ -587,6 +588,7 @@ Status MasterSession::ReffedClientGraph::RunPartitionsHelper(
       c->req->set_is_last_partial_run(is_last_partial_run);
     }
     c->req->set_session_handle(session_handle_);
+    c->req->set_create_worker_session_called(!should_deregister_);
     c->req->set_graph_handle(part.graph_handle);
     c->req->set_step_id(step_id);
     *c->req->mutable_exec_opts() = exec_opts;
@@ -1003,6 +1005,7 @@ void MasterSession::ReffedClientGraph::DeregisterPartitions() {
     if (!part.graph_handle.empty()) {
       Call* c = new Call;
       c->req.set_session_handle(session_handle_);
+      c->req.set_create_worker_session_called(!should_deregister_);
       c->req.set_graph_handle(part.graph_handle);
       // NOTE(mrry): We must capture `worker_cache_` since `this`
       // could be deleted before the callback is called.
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 18668b44d3..40bf564cab 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -282,10 +282,18 @@ const string& InMemoryRunGraphRequest::session_handle() const {
   return session_handle_;
 }
 
+bool InMemoryRunGraphRequest::create_worker_session_called() const {
+  return create_worker_session_called_;
+}
+
 void InMemoryRunGraphRequest::set_session_handle(const string& handle) {
   session_handle_ = handle;
 }
 
+void InMemoryRunGraphRequest::set_create_worker_session_called(bool called) {
+  create_worker_session_called_ = called;
+}
+
 const string& InMemoryRunGraphRequest::graph_handle() const {
   return graph_handle_;
 }
@@ -378,6 +386,8 @@ const RunGraphRequest& InMemoryRunGraphRequest::ToProto() const {
   if (!proto_version_) {
     proto_version_.reset(new RunGraphRequest);
     proto_version_->set_session_handle(session_handle());
+    proto_version_->set_create_worker_session_called(
+        create_worker_session_called());
     proto_version_->set_graph_handle(graph_handle());
     proto_version_->set_step_id(step_id());
     *proto_version_->mutable_exec_opts() = exec_opts();
@@ -403,6 +413,15 @@ void MutableProtoRunGraphRequest::set_session_handle(const string& handle) {
   request_.set_session_handle(handle);
 }
 
+bool MutableProtoRunGraphRequest::create_worker_session_called() const {
+  return request_.create_worker_session_called();
+}
+
+void MutableProtoRunGraphRequest::set_create_worker_session_called(
+    bool called) {
+  request_.set_create_worker_session_called(called);
+}
+
 const string& MutableProtoRunGraphRequest::graph_handle() const {
   return request_.graph_handle();
 }
@@ -514,6 +533,10 @@ const string& ProtoRunGraphRequest::session_handle() const {
   return request_->session_handle();
 }
 
+bool ProtoRunGraphRequest::create_worker_session_called() const {
+  return request_->create_worker_session_called();
+}
+
 const string& ProtoRunGraphRequest::graph_handle() const {
   return request_->graph_handle();
 }
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 1f7cdb98a4..92c5668e3a 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -246,6 +246,9 @@ class RunGraphRequestWrapper {
   // namespace is used.
   virtual const string& session_handle() const = 0;
 
+  // Returns true if `CreateWorkerSession` was called for `session_handle`.
+  virtual bool create_worker_session_called() const = 0;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   virtual const string& graph_handle() const = 0;
@@ -293,6 +296,7 @@ class RunGraphRequestWrapper {
 class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
  public:
   virtual void set_session_handle(const string& handle) = 0;
+  virtual void set_create_worker_session_called(bool called) = 0;
   virtual void set_graph_handle(const string& handle) = 0;
   virtual void set_step_id(int64 step_id) = 0;
   virtual ExecutorOpts* mutable_exec_opts() = 0;
@@ -317,6 +321,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   // RunGraphRequestWrapper methods.
   const string& session_handle() const override;
   const string& graph_handle() const override;
+  bool create_worker_session_called() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
   size_t num_sends() const override;
@@ -331,6 +336,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
 
   // MutableRunGraphRequestWrapper methods.
   void set_session_handle(const string& handle) override;
+  void set_create_worker_session_called(bool called) override;
   void set_graph_handle(const string& handle) override;
   void set_step_id(int64 step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
@@ -347,6 +353,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
 
  private:
   string session_handle_;
+  bool create_worker_session_called_ = false;  // Until the setter is called.
   string graph_handle_;
   int64 step_id_;
   ExecutorOpts exec_opts_;
@@ -370,6 +377,7 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
  public:
   // RunGraphRequestWrapper methods.
   const string& session_handle() const override;
+  bool create_worker_session_called() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
@@ -385,6 +393,7 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
 
   // MutableRunGraphRequestWrapper methods.
   void set_session_handle(const string& handle) override;
+  void set_create_worker_session_called(bool called) override;
   void set_graph_handle(const string& handle) override;
   void set_step_id(int64 step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
@@ -409,6 +418,7 @@ class ProtoRunGraphRequest : public RunGraphRequestWrapper {
 
   // RunGraphRequestWrapper methods.
   const string& session_handle() const override;
+  bool create_worker_session_called() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 51b9547f53..e51d63cf2b 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -98,20 +98,26 @@ Status SessionMgr::DeleteSession(const string& session) {
   return Status::OK();
 }
 
-std::shared_ptr<WorkerSession> SessionMgr::WorkerSessionForSessionUnlocked(
-    const string& session) {
-  auto it = sessions_.find(session);
-  if (it == sessions_.end()) {
-    return legacy_session_;
+Status SessionMgr::WorkerSessionForSessionLocked(
+    const string& session_handle, std::shared_ptr<WorkerSession>* out_session) {
+  if (session_handle.empty()) {
+    *out_session = legacy_session_;
   } else {
-    return it->second;
+    auto it = sessions_.find(session_handle);
+    if (it == sessions_.end()) {
+      return errors::Aborted("Session handle is not found: ", session_handle,
+                             ". Possibly this worker just restarted.");
+    } else {
+      *out_session = it->second;
+    }
   }
+  return Status::OK();
 }
 
-std::shared_ptr<WorkerSession> SessionMgr::WorkerSessionForSession(
-    const string& session) {
+Status SessionMgr::WorkerSessionForSession(
+    const string& session_handle, std::shared_ptr<WorkerSession>* out_session) {
   mutex_lock l(mu_);
-  return WorkerSessionForSessionUnlocked(session);
+  return WorkerSessionForSessionLocked(session_handle, out_session);
 }
 
 std::shared_ptr<WorkerSession> SessionMgr::LegacySession() {
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 4c9702d522..0a10fe240f 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -50,7 +50,8 @@ class SessionMgr {
                        bool isolate_session_state);
 
   // Locates the worker session for a given session handle
-  std::shared_ptr<WorkerSession> WorkerSessionForSession(const string& session);
+  Status WorkerSessionForSession(const string& session_handle,
+                                 std::shared_ptr<WorkerSession>* out_session);
   std::shared_ptr<WorkerSession> LegacySession();
 
   Status DeleteSession(const string& session);
@@ -86,8 +87,9 @@ class SessionMgr {
 
   const WorkerCacheFactory worker_cache_factory_;
 
-  std::shared_ptr<WorkerSession> WorkerSessionForSessionUnlocked(
-      const string& session) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  Status WorkerSessionForSessionLocked(
+      const string& session_handle, std::shared_ptr<WorkerSession>* out_session)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   mutex mu_;
   // A map from session identifier to internal session structure.
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 4d028f7f4a..858e636e08 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -46,8 +46,8 @@ class SessionMgrTest : public ::testing::Test {
       : device_(FakeDevice::MakeCPU(
             "/job:mnist/replica:0/task:0/device:fakecpu:0")),
         mgr_(&env_, "/job:mnist/replica:0/task:0",
-             std::unique_ptr<WorkerCacheInterface>(), factory_),
-        legacy_session_(mgr_.WorkerSessionForSession("novel_session_id")) {
+             std::unique_ptr<WorkerCacheInterface>(), factory_) {
+    TF_CHECK_OK(mgr_.WorkerSessionForSession("", &legacy_session_));
     env_.local_devices = {device_.get()};
   }
 
@@ -69,7 +69,8 @@ TEST_F(SessionMgrTest, CreateSessionSimple) {
 
   string session_handle = "test_session_handle";
   TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
-  auto session = mgr_.WorkerSessionForSession(session_handle);
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
   EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
   EXPECT_NE(mgr_.LegacySession(), session);
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
@@ -81,22 +82,26 @@ TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
   server_def.set_task_index(3);
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_1", server_def, false));
-  auto session_1 = mgr_.WorkerSessionForSession("handle_1");
+  std::shared_ptr<WorkerSession> session_1;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_1", &session_1));
   std::vector<Device*> devices_1 = session_1->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_1.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_2", server_def, false));
-  auto session_2 = mgr_.WorkerSessionForSession("handle_2");
+  std::shared_ptr<WorkerSession> session_2;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_2", &session_2));
   std::vector<Device*> devices_2 = session_2->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_2.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_3", server_def, true));
-  auto session_3 = mgr_.WorkerSessionForSession("handle_3");
+  std::shared_ptr<WorkerSession> session_3;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_3", &session_3));
   std::vector<Device*> devices_3 = session_3->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_3.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_4", server_def, true));
-  auto session_4 = mgr_.WorkerSessionForSession("handle_4");
+  std::shared_ptr<WorkerSession> session_4;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_4", &session_4));
   std::vector<Device*> devices_4 = session_4->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_4.size());
 
@@ -109,12 +114,23 @@ TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
 TEST_F(SessionMgrTest, LegacySession) {
   ServerDef server_def;
   string session_handle = "";
-  auto session = mgr_.WorkerSessionForSession(session_handle);
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
   EXPECT_EQ(mgr_.LegacySession(), session);
 
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
+TEST_F(SessionMgrTest, UnknownSessionHandle) {
+  ServerDef server_def;
+  string session_handle = "unknown_session_handle";
+  std::shared_ptr<WorkerSession> session;
+  Status s = mgr_.WorkerSessionForSession(session_handle, &session);
+  EXPECT_TRUE(errors::IsAborted(s));
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(), "Session handle is not found"));
+}
+
 TEST_F(SessionMgrTest, WorkerNameFromServerDef) {
   ServerDef server_def;
   server_def.set_job_name("worker");
@@ -124,7 +140,7 @@ TEST_F(SessionMgrTest, WorkerNameFromServerDef) {
 }
 
 TEST_F(SessionMgrTest, DeleteLegacySession) {
-  TF_EXPECT_OK(mgr_.DeleteSession("legacy_session"));
+  TF_EXPECT_OK(mgr_.DeleteSession(""));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 598652fb98..6b2536c3c0 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -59,21 +59,37 @@ void Worker::DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
 void Worker::RegisterGraphAsync(const RegisterGraphRequest* request,
                                 RegisterGraphResponse* response,
                                 StatusCallback done) {
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
-  Status s = session->graph_mgr->Register(
-      request->session_handle(), request->graph_def(), request->graph_options(),
-      request->debug_options(), session->cluster_flr.get(),
-      response->mutable_graph_handle());
+  std::shared_ptr<WorkerSession> session;
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (s.ok()) {
+    s = session->graph_mgr->Register(
+        request->session_handle(), request->graph_def(),
+        request->graph_options(), request->debug_options(),
+        session->cluster_flr.get(), response->mutable_graph_handle());
+  }
   done(s);
 }
 
 void Worker::DeregisterGraphAsync(const DeregisterGraphRequest* request,
                                   DeregisterGraphResponse* response,
                                   StatusCallback done) {
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
-  Status s = session->graph_mgr->Deregister(request->graph_handle());
+  std::shared_ptr<WorkerSession> session;
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (s.ok()) {
+    s = session->graph_mgr->Deregister(request->graph_handle());
+  }
 
   done(s);
 }
@@ -135,11 +151,21 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
                         StatusCallback done) {
   const int64 step_id = request->step_id();
   TRACEPRINTF("RunGraph: %lld", step_id);
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
+  std::shared_ptr<WorkerSession> session;
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   GraphMgr::NamedTensors in;
   GraphMgr::NamedTensors* out = new GraphMgr::NamedTensors;
-  Status s = PrepareRunGraph(request, &in, out);
+  s = PrepareRunGraph(request, &in, out);
   if (!s.ok()) {
     delete out;
     done(s);
@@ -209,12 +235,23 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
   const int64 step_id = request->step_id();
   const string& graph_handle = request->graph_handle();
   TRACEPRINTF("PartialRunGraph: %lld", step_id);
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
+  std::shared_ptr<WorkerSession> session;
+
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
 
   GraphMgr::NamedTensors in;
   GraphMgr::NamedTensors* out = new GraphMgr::NamedTensors;
-  Status s = PrepareRunGraph(request, &in, out);
+  s = PrepareRunGraph(request, &in, out);
   auto finish = [done, out, opts](const Status& s) {
     opts->ClearCancelCallback();
     delete out;
diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc
index cb7059b36e..18886babd5 100644
--- a/tensorflow/core/distributed_runtime/worker_session.cc
+++ b/tensorflow/core/distributed_runtime/worker_session.cc
@@ -97,6 +97,7 @@ WorkerSession::WorkerSession(const string& session_name,
       worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
       device_mgr(std::move(device_mgr)),
       graph_mgr(std::move(graph_mgr)),
-      cluster_flr(new ClusterFunctionLibraryRuntime(this)) {}
+      cluster_flr(
+          new ClusterFunctionLibraryRuntime(this, !session_name.empty())) {}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index 3e7289bd91..1819a35248 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -103,6 +103,9 @@ message RegisterGraphRequest {
   // Subgraphs are scoped within one session.
   string session_handle = 1;
 
+  // Set to true if `CreateWorkerSession` was called for `session_handle`.
+  bool create_worker_session_called = 6;
+
   // "graph_def" has the subgraph of nodes for this worker, with each node
   // having its device_name filled in.
   GraphDef graph_def = 2;
@@ -144,6 +147,9 @@ message DeregisterGraphRequest {
   // empty, a single global namespace is used.
   string session_handle = 2;
 
+  // Set to true if `CreateWorkerSession` was called for `session_handle`.
+  bool create_worker_session_called = 3;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   string graph_handle = 1;
@@ -200,6 +206,9 @@ message RunGraphRequest {
   // search for the graph_handle.
   string session_handle = 8;
 
+  // Set to true if `CreateWorkerSession` was called for `session_handle`.
+  bool create_worker_session_called = 10;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   string graph_handle = 1;
@@ -234,7 +243,7 @@ message RunGraphRequest {
   // truncate long metadata messages.
   bool store_errors_in_response_body = 9;
 
-  // Next: 10
+  // Next: 11
 }
 
 message RunGraphResponse {
-- 
GitLab


From 8c66f2223078dca765e7817f26f66e61fe819715 Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Wed, 18 Apr 2018 15:30:30 -0700
Subject: [PATCH 102/450] Automated g4 rollback of changelist 192180356

PiperOrigin-RevId: 193427566
---
 .../xla/service/algebraic_simplifier.cc       |  1 -
 .../compiler/xla/service/dfs_hlo_visitor.h    |  1 -
 .../service/dfs_hlo_visitor_with_default.h    |  3 ---
 .../xla/service/hlo_constant_folding.cc       |  3 +--
 .../compiler/xla/service/hlo_cost_analysis.cc |  5 ----
 .../compiler/xla/service/hlo_cost_analysis.h  |  1 -
 .../compiler/xla/service/hlo_graph_dumper.cc  |  1 -
 .../compiler/xla/service/hlo_instruction.cc   | 19 ++-----------
 .../compiler/xla/service/hlo_instruction.h    |  4 ---
 tensorflow/compiler/xla/service/hlo_opcode.h  |  1 -
 .../compiler/xla/service/hlo_verifier.cc      | 27 ++++---------------
 .../compiler/xla/service/hlo_verifier.h       |  1 -
 .../xla/service/instruction_fusion.cc         |  7 ++---
 .../compiler/xla/service/pattern_matcher.h    |  1 -
 .../compiler/xla/tools/parser/hlo_parser.cc   |  9 -------
 .../xla/tools/parser/hlo_parser_test.cc       | 12 ---------
 16 files changed, 10 insertions(+), 86 deletions(-)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 8d26938c6e..8e785de68c 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1412,7 +1412,6 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   return Status::OK();
 }
 
-// TODO(b/74536353): do this simplification for BroadcastDimOne as well.
 StatusOr<bool> AlgebraicSimplifierVisitor::
     TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(
         HloInstruction* reshape_or_broadcast) {
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 3f7089d6ca..56723e7650 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -199,7 +199,6 @@ class DfsHloVisitorBase {
   virtual Status HandleReduce(HloInstructionPtr hlo) = 0;
   virtual Status HandleBitcast(HloInstructionPtr hlo) = 0;
   virtual Status HandleBroadcast(HloInstructionPtr hlo) = 0;
-  virtual Status HandleBroadcastDimOne(HloInstructionPtr hlo) = 0;
   virtual Status HandleReshape(HloInstructionPtr hlo) = 0;
   virtual Status HandleTranspose(HloInstructionPtr hlo) = 0;
   virtual Status HandleParameter(HloInstructionPtr hlo) = 0;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index e6680ee9b8..240faebe62 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -158,9 +158,6 @@ class DfsHloVisitorWithDefaultBase
   Status HandleBroadcast(HloInstructionPtr broadcast) override {
     return DefaultAction(broadcast);
   }
-  Status HandleBroadcastDimOne(HloInstructionPtr broadcastDimOne) override {
-    return DefaultAction(broadcastDimOne);
-  }
   Status HandlePad(HloInstructionPtr pad) override {
     return DefaultAction(pad);
   }
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 7aa38c6b79..35ecd4428d 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -69,8 +69,7 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
       // Broadcasts dramatically increase the size of constants, which is often
       // detrimental to performance and memory capacity, so do not fold
       // broadcasts.
-      if (instruction->opcode() == HloOpcode::kBroadcast ||
-          instruction->opcode() == HloOpcode::kBroadcastDimOne) {
+      if (instruction->opcode() == HloOpcode::kBroadcast) {
         continue;
       }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index ea4dd62fdb..44e4f75f75 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -336,11 +336,6 @@ Status HloCostAnalysis::HandleBroadcast(const HloInstruction*) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleBroadcastDimOne(
-    const HloInstruction* broadcastDimOne) {
-  return Status::OK();
-}
-
 Status HloCostAnalysis::HandlePad(const HloInstruction*) {
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index a9f6845747..d17678d20f 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -95,7 +95,6 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleSelectAndScatter(const HloInstruction* instruction) override;
   Status HandleBitcast(const HloInstruction* bitcast) override;
   Status HandleBroadcast(const HloInstruction* broadcast) override;
-  Status HandleBroadcastDimOne(const HloInstruction* broadcastDimOne) override;
   Status HandlePad(const HloInstruction* pad) override;
   Status HandleReshape(const HloInstruction* reshape) override;
   Status HandleTranspose(const HloInstruction* transpose) override;
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index c35783c456..25702dc65e 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -956,7 +956,6 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kTuple:
       return kWhite;
     case HloOpcode::kBroadcast:
-    case HloOpcode::kBroadcastDimOne:
       // De-emphasize nodes which broadcast a scalar within a fusion node --
       // these are essentially free.
       if (instr->IsFused() &&
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 56cb241087..a445380817 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -700,15 +700,6 @@ HloInstruction::CreateSelectAndScatter(
   return instruction;
 }
 
-/* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateBroadcastDimOne(const Shape& shape,
-                                      HloInstruction* operand) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kBroadcastDimOne, shape));
-  instruction->AppendOperand(operand);
-  return instruction;
-}
-
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateBroadcastSequence(
     const Shape& output_shape, HloInstruction* operand,
@@ -1311,10 +1302,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateBroadcast(shape, new_operands[0], dimensions_);
       break;
-    case HloOpcode::kBroadcastDimOne:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateBroadcastDimOne(shape, new_operands[0]);
-      break;
     case HloOpcode::kCall:
       clone = CreateCall(shape, new_operands, to_apply());
       break;
@@ -1863,8 +1850,6 @@ bool HloInstruction::IdenticalSlowPath(
 
     // Remaining instructions with special values.
     case HloOpcode::kBitcast:
-    case HloOpcode::kBroadcastDimOne:
-    case HloOpcode::kDynamicUpdateSlice:
       return eq_shapes(shape(), other.shape());
     case HloOpcode::kBroadcast:
       return eq_shapes(shape(), other.shape()) &&
@@ -1883,6 +1868,8 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kDynamicSlice:
       return eq_shapes(shape(), other.shape()) &&
              dynamic_slice_sizes_ == other.dynamic_slice_sizes_;
+    case HloOpcode::kDynamicUpdateSlice:
+      return eq_shapes(shape(), other.shape());
     case HloOpcode::kCall:
     case HloOpcode::kMap:
       return eq_computations(to_apply(), other.to_apply());
@@ -2692,8 +2679,6 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleBitcast(this);
     case HloOpcode::kBroadcast:
       return visitor->HandleBroadcast(this);
-    case HloOpcode::kBroadcastDimOne:
-      return visitor->HandleBroadcastDimOne(this);
     case HloOpcode::kPad:
       return visitor->HandlePad(this);
     case HloOpcode::kReshape:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 49aa075029..5a7394f7a6 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -401,10 +401,6 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
 
-  // Creates a broadcast-size-one-dimensions instruction.
-  static std::unique_ptr<HloInstruction> CreateBroadcastDimOne(
-      const Shape& shape, HloInstruction* operand);
-
   // Creates a sequence of instructions that performs an explicit broadcast of
   // the operand to the target shape.
   //
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index dddc72480f..af24604c39 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -54,7 +54,6 @@ namespace xla {
   V(kBitcast, "bitcast")                                     \
   V(kBitcastConvert, "bitcast-convert")                      \
   V(kBroadcast, "broadcast")                                 \
-  V(kBroadcastDimOne, "broadcast-dim-one")                   \
   V(kCall, "call", kHloOpcodeIsVariadic)                     \
   V(kCeil, "ceil")                                           \
   V(kClamp, "clamp")                                         \
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 63ec5964eb..8c875698eb 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -174,34 +174,17 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
   TF_RETURN_IF_ERROR(CheckShape(broadcast, broadcast->shape()));
   TF_RET_CHECK(ShapeUtil::Rank(operand_shape) ==
                broadcast->dimensions().size());
-  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
-    int64 output_dimension = broadcast->dimensions()[i];
+  for (int64 operand_dimension = 0;
+       operand_dimension < ShapeUtil::Rank(operand_shape);
+       ++operand_dimension) {
+    int64 output_dimension = broadcast->dimensions()[operand_dimension];
     TF_RET_CHECK(broadcast->shape().dimensions(output_dimension) ==
-                 operand_shape.dimensions(i))
+                 operand_shape.dimensions(operand_dimension))
         << broadcast->ToString() << " operand shape " << operand_shape;
   }
   return tensorflow::Status::OK();
 }
 
-Status ShapeVerifier::HandleBroadcastDimOne(HloInstruction* broadcastDimOne) {
-  const Shape& operand_shape = broadcastDimOne->operand(0)->shape();
-  int64 operand_rank = ShapeUtil::Rank(operand_shape);
-  const Shape& output_shape = broadcastDimOne->shape();
-  // Check for mixed precision.
-  TF_RETURN_IF_ERROR(CheckShape(broadcastDimOne, output_shape));
-  TF_RET_CHECK(operand_rank == ShapeUtil::Rank(output_shape));
-  for (int64 i = 0; i < operand_rank; ++i) {
-    int64 operand_dimension = operand_shape.dimensions(i);
-    int64 output_dimension = output_shape.dimensions(i);
-    TF_RET_CHECK(operand_dimension == 1 ||
-                 operand_dimension == output_dimension)
-        << "Dimension " << i << " of broadcastDimOne "
-        << broadcastDimOne->ToString() << " is " << operand_dimension
-        << ", expected 1 or " << output_dimension;
-  }
-  return tensorflow::Status::OK();
-}
-
 Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
   // Check for mixed precision.
   TF_RETURN_IF_ERROR(CheckShape(reshape, reshape->shape()));
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index a4dff977ba..1dd7ec3c51 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -54,7 +54,6 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleReduce(HloInstruction* reduce) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
   Status HandleBroadcast(HloInstruction* broadcast) override;
-  Status HandleBroadcastDimOne(HloInstruction* broadcastDimOne) override;
   Status HandleReshape(HloInstruction* reshape) override;
   Status HandleTranspose(HloInstruction* transpose) override;
   Status HandleParameter(HloInstruction*) override;
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 3f4dbf897d..d69ad80bdb 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -37,7 +37,6 @@ namespace xla {
     case HloOpcode::kBitcast:
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kBroadcast:
-    case HloOpcode::kBroadcastDimOne:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kComplex:
@@ -143,8 +142,7 @@ bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) {
       });
   return std::count_if(hlo->operands().begin(), hlo->operands().end(),
                        [output_rank](HloInstruction* operand) {
-                         if (operand->opcode() == HloOpcode::kBroadcast ||
-                             operand->opcode() == HloOpcode::kBroadcastDimOne) {
+                         if (operand->opcode() == HloOpcode::kBroadcast) {
                            return false;
                          }
                          if (operand->opcode() == HloOpcode::kConstant &&
@@ -249,8 +247,7 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
     auto reachability = computation->ComputeReachability();
 
     auto cheap_to_duplicate = [this](HloInstruction* producer) {
-      if (producer->opcode() == HloOpcode::kBroadcast ||
-          producer->opcode() == HloOpcode::kBroadcastDimOne) {
+      if (producer->opcode() == HloOpcode::kBroadcast) {
         return true;
       }
       if (producer->opcode() == HloOpcode::kConstant &&
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index f5a4f2c9df..586f6ef7a9 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -879,7 +879,6 @@ XLA_UNOP_PATTERN(Abs)
 XLA_UNOP_PATTERN(RoundNearestAfz)
 XLA_UNOP_PATTERN(Bitcast)
 XLA_UNOP_PATTERN(Broadcast)
-XLA_UNOP_PATTERN(BroadcastDimOne)
 XLA_UNOP_PATTERN(Ceil)
 XLA_UNOP_PATTERN(Copy)
 XLA_UNOP_PATTERN(Cos)
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index b2f122982a..e60a5a4919 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -724,15 +724,6 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           shape, operands[0], *broadcast_dimensions));
       break;
     }
-    case HloOpcode::kBroadcastDimOne: {
-      if (!ParseOperands(&operands, /*expected_size=*/1) ||
-          !ParseAttributes(attrs)) {
-        return false;
-      }
-      instruction = builder->AddInstruction(
-          HloInstruction::CreateBroadcastDimOne(shape, operands[0]));
-      break;
-    }
     case HloOpcode::kConcatenate: {
       optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index 57684b5834..adc8b1d620 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -57,18 +57,6 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 
-)"
-},
-// broadcast size-one dimensions
-{
-"BroadcastDimOne",
-R"(HloModule broadcast_dim_one_module
-
-ENTRY %broadcast-dim-one () -> f32[2,2] {
-  %constant = f32[1,2]{1,0} constant(f32[1,2] { { 1.1, 2.2 } })
-  ROOT %broadcast-dim-one = f32[2,2]{1,0} broadcast-dim-one(f32[1,2]{1,0} %constant)
-}
-
 )"
 },
 // pred constant
-- 
GitLab


From 529c56d88f27337d6be263b6f61a2a7a1994bb2d Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Wed, 18 Apr 2018 15:33:39 -0700
Subject: [PATCH 103/450] Add --test_output=errors as default

---
 tensorflow/tools/ci_build/ci_parameterized_build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 9d23b508aa..797e0a6db5 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -237,7 +237,7 @@ function get_cuda_capability_version() {
 CTYPE=${TF_BUILD_CONTAINER_TYPE}
 
 # Determine if the machine is a Mac
-OPT_FLAG=""
+OPT_FLAG="--test_output=errors"
 if [[ "$(uname -s)" == "Darwin" ]]; then
   DO_DOCKER=0
 
-- 
GitLab


From 427a458ae638b8488280019498e6ea5e238eb925 Mon Sep 17 00:00:00 2001
From: Dustin Tran <trandustin@google.com>
Date: Wed, 18 Apr 2018 15:38:38 -0700
Subject: [PATCH 104/450] Have TensorFlow Distributions share name scopes
 across method calls. END_PUBLIC

*** Reason for rollback ***

Roll forward, allowing distributions to have same names across objects.

*** Original change description ***

BEGIN_PUBLIC
Automated g4 rollback of changelist 190728742

PiperOrigin-RevId: 193428925
---
 .../python/kernel_tests/distribution_test.py  | 39 ++++++++++++++-----
 .../kernel_tests/mvn_full_covariance_test.py  |  2 +-
 .../python/ops/autoregressive.py              |  2 +-
 .../distributions/python/ops/binomial.py      |  2 +-
 .../distributions/python/ops/cauchy.py        |  2 +-
 .../contrib/distributions/python/ops/chi2.py  |  4 +-
 .../distributions/python/ops/deterministic.py |  2 +-
 .../distributions/python/ops/geometric.py     |  2 +-
 .../distributions/python/ops/gumbel.py        |  2 +-
 .../distributions/python/ops/half_normal.py   |  2 +-
 .../distributions/python/ops/independent.py   |  2 +-
 .../distributions/python/ops/inverse_gamma.py |  4 +-
 .../distributions/python/ops/kumaraswamy.py   |  9 +++--
 .../distributions/python/ops/logistic.py      |  2 +-
 .../distributions/python/ops/mixture.py       |  2 +-
 .../python/ops/mixture_same_family.py         |  2 +-
 .../distributions/python/ops/mvn_diag.py      |  4 +-
 .../python/ops/mvn_diag_plus_low_rank.py      |  2 +-
 .../python/ops/mvn_full_covariance.py         |  2 +-
 .../python/ops/mvn_linear_operator.py         |  2 +-
 .../distributions/python/ops/mvn_tril.py      |  2 +-
 .../python/ops/negative_binomial.py           |  2 +-
 .../python/ops/onehot_categorical.py          |  2 +-
 .../distributions/python/ops/poisson.py       |  2 +-
 .../python/ops/poisson_lognormal.py           |  2 +-
 .../python/ops/quantized_distribution.py      |  2 +-
 .../python/ops/relaxed_bernoulli.py           |  2 +-
 .../python/ops/relaxed_onehot_categorical.py  |  2 +-
 .../distributions/python/ops/sinh_arcsinh.py  |  3 +-
 .../python/ops/vector_diffeomixture.py        |  2 +-
 .../python/ops/vector_exponential_diag.py     |  2 +-
 .../ops/vector_exponential_linear_operator.py |  2 +-
 .../python/ops/vector_sinh_arcsinh_diag.py    |  2 +-
 .../python/ops/vector_student_t.py            |  2 +-
 .../distributions/python/ops/wishart.py       | 10 ++---
 .../python/ops/distributions/bernoulli.py     |  2 +-
 tensorflow/python/ops/distributions/beta.py   |  6 +--
 .../python/ops/distributions/categorical.py   |  2 +-
 .../python/ops/distributions/dirichlet.py     |  2 +-
 .../distributions/dirichlet_multinomial.py    |  2 +-
 .../python/ops/distributions/distribution.py  |  6 ++-
 .../python/ops/distributions/exponential.py   |  4 +-
 tensorflow/python/ops/distributions/gamma.py  |  4 +-
 .../python/ops/distributions/laplace.py       |  4 +-
 .../python/ops/distributions/multinomial.py   |  2 +-
 tensorflow/python/ops/distributions/normal.py |  4 +-
 .../python/ops/distributions/student_t.py     |  4 +-
 .../distributions/transformed_distribution.py |  2 +-
 .../python/ops/distributions/uniform.py       |  2 +-
 49 files changed, 100 insertions(+), 75 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
index 68e0d9cb82..f42feae25d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
@@ -190,11 +190,30 @@ class DistributionTest(test.TestCase):
       y = dist._set_sample_static_shape(x, sample_shape)
       self.assertTrue(y.get_shape().ndims is None)
 
+  def testNameScopeWorksCorrectly(self):
+    x = tfd.Normal(loc=0., scale=1., name="x")
+    x_duplicate = tfd.Normal(loc=0., scale=1., name="x")
+    with ops.name_scope("y") as name:
+      y = tfd.Bernoulli(logits=0., name=name)
+    x_sample = x.sample(name="custom_sample")
+    x_sample_duplicate = x.sample(name="custom_sample")
+    x_log_prob = x.log_prob(0., name="custom_log_prob")
+    x_duplicate_sample = x_duplicate.sample(name="custom_sample")
+
+    self.assertEqual(x.name, "x/")
+    self.assertEqual(x_duplicate.name, "x_1/")
+    self.assertEqual(y.name, "y/")
+    self.assertTrue(x_sample.name.startswith("x/custom_sample"))
+    self.assertTrue(x_sample_duplicate.name.startswith("x/custom_sample_1"))
+    self.assertTrue(x_log_prob.name.startswith("x/custom_log_prob"))
+    self.assertTrue(x_duplicate_sample.name.startswith(
+        "x_1/custom_sample"))
+
   def testStrWorksCorrectlyScalar(self):
     normal = tfd.Normal(loc=np.float16(0), scale=np.float16(1))
     self.assertEqual(
         ("tf.distributions.Normal("
-         "\"Normal\", "
+         "\"Normal/\", "
          "batch_shape=(), "
          "event_shape=(), "
          "dtype=float16)"),  # Got the dtype right.
@@ -203,7 +222,7 @@ class DistributionTest(test.TestCase):
     chi2 = tfd.Chi2(df=np.float32([1., 2.]), name="silly")
     self.assertEqual(
         ("tf.distributions.Chi2("
-         "\"silly\", "  # What a silly name that is!
+         "\"silly/\", "  # What a silly name that is!
          "batch_shape=(2,), "
          "event_shape=(), "
          "dtype=float32)"),
@@ -211,7 +230,7 @@ class DistributionTest(test.TestCase):
 
     exp = tfd.Exponential(rate=array_ops.placeholder(dtype=dtypes.float32))
     self.assertEqual(
-        ("tf.distributions.Exponential(\"Exponential\", "
+        ("tf.distributions.Exponential(\"Exponential/\", "
          # No batch shape.
          "event_shape=(), "
          "dtype=float32)"),
@@ -222,7 +241,7 @@ class DistributionTest(test.TestCase):
         loc=np.zeros([2, 2]), name="MVN")
     self.assertEqual(
         ("tf.distributions.MultivariateNormalDiag("
-         "\"MVN\", "
+         "\"MVN/\", "
          "batch_shape=(2,), "
          "event_shape=(2,), "
          "dtype=float64)"),
@@ -233,7 +252,7 @@ class DistributionTest(test.TestCase):
         name="MVN2")
     self.assertEqual(
         ("tf.distributions.MultivariateNormalDiag("
-         "\"MVN2\", "
+         "\"MVN2/\", "
          "batch_shape=(?,), "  # Partially known.
          "event_shape=(3,), "
          "dtype=float32)"),
@@ -243,7 +262,7 @@ class DistributionTest(test.TestCase):
     normal = tfd.Normal(loc=np.float16(0), scale=np.float16(1))
     self.assertEqual(
         ("<tf.distributions.Normal"
-         " 'Normal'"
+         " 'Normal/'"
          " batch_shape=()"
          " event_shape=()"
          " dtype=float16>"),  # Got the dtype right.
@@ -252,7 +271,7 @@ class DistributionTest(test.TestCase):
     chi2 = tfd.Chi2(df=np.float32([1., 2.]), name="silly")
     self.assertEqual(
         ("<tf.distributions.Chi2"
-         " 'silly'"  # What a silly name that is!
+         " 'silly/'"  # What a silly name that is!
          " batch_shape=(2,)"
          " event_shape=()"
          " dtype=float32>"),
@@ -261,7 +280,7 @@ class DistributionTest(test.TestCase):
     exp = tfd.Exponential(rate=array_ops.placeholder(dtype=dtypes.float32))
     self.assertEqual(
         ("<tf.distributions.Exponential"
-         " 'Exponential'"
+         " 'Exponential/'"
          " batch_shape=<unknown>"
          " event_shape=()"
          " dtype=float32>"),
@@ -272,7 +291,7 @@ class DistributionTest(test.TestCase):
         loc=np.zeros([2, 2]), name="MVN")
     self.assertEqual(
         ("<tf.distributions.MultivariateNormalDiag"
-         " 'MVN'"
+         " 'MVN/'"
          " batch_shape=(2,)"
          " event_shape=(2,)"
          " dtype=float64>"),
@@ -283,7 +302,7 @@ class DistributionTest(test.TestCase):
         name="MVN2")
     self.assertEqual(
         ("<tf.distributions.MultivariateNormalDiag"
-         " 'MVN2'"
+         " 'MVN2/'"
          " batch_shape=(?,)"  # Partially known.
          " event_shape=(3,)"
          " dtype=float32>"),
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
index 1a02fbefb8..7435bcbc68 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
@@ -52,7 +52,7 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
       mu = [1., 2.]
       sigma = [[1., 0.], [0., 1.]]
       mvn = ds.MultivariateNormalFullCovariance(mu, sigma, name="Billy")
-      self.assertEqual(mvn.name, "Billy")
+      self.assertEqual(mvn.name, "Billy/")
 
   def testDoesNotRaiseIfInitializedWithSymmetricMatrix(self):
     with self.test_session():
diff --git a/tensorflow/contrib/distributions/python/ops/autoregressive.py b/tensorflow/contrib/distributions/python/ops/autoregressive.py
index 69f3d57ff0..88ed012784 100644
--- a/tensorflow/contrib/distributions/python/ops/autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/autoregressive.py
@@ -145,7 +145,7 @@ class Autoregressive(distribution_lib.Distribution):
       ValueError: if `num_steps < 1`.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       self._distribution_fn = distribution_fn
       self._sample0 = sample0
       self._distribution0 = (distribution_fn() if sample0 is None
diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py
index 6a1bb39ab2..12d1603178 100644
--- a/tensorflow/contrib/distributions/python/ops/binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/binomial.py
@@ -164,7 +164,7 @@ class Binomial(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, logits, probs]):
+    with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._total_count = self._maybe_assert_valid_total_count(
           ops.convert_to_tensor(total_count, name="total_count"),
           validate_args)
diff --git a/tensorflow/contrib/distributions/python/ops/cauchy.py b/tensorflow/contrib/distributions/python/ops/cauchy.py
index 6f5d724a2a..daacfe657f 100644
--- a/tensorflow/contrib/distributions/python/ops/cauchy.py
+++ b/tensorflow/contrib/distributions/python/ops/cauchy.py
@@ -121,7 +121,7 @@ class Cauchy(distribution.Distribution):
       TypeError: if `loc` and `scale` have different `dtype`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)]
                                     if validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
diff --git a/tensorflow/contrib/distributions/python/ops/chi2.py b/tensorflow/contrib/distributions/python/ops/chi2.py
index e610f469e5..c77c5fd208 100644
--- a/tensorflow/contrib/distributions/python/ops/chi2.py
+++ b/tensorflow/contrib/distributions/python/ops/chi2.py
@@ -88,7 +88,7 @@ class Chi2(gamma.Gamma):
     # not true in the parent class "gamma."  therefore, passing
     # allow_nan_stats=True
     # through to the parent class results in unnecessary asserts.
-    with ops.name_scope(name, values=[df]):
+    with ops.name_scope(name, values=[df]) as name:
       with ops.control_dependencies([
           check_ops.assert_positive(df),
       ] if validate_args else []):
@@ -120,7 +120,7 @@ class Chi2WithAbsDf(Chi2):
                allow_nan_stats=True,
                name="Chi2WithAbsDf"):
     parameters = locals()
-    with ops.name_scope(name, values=[df]):
+    with ops.name_scope(name, values=[df]) as name:
       super(Chi2WithAbsDf, self).__init__(
           df=math_ops.floor(
               math_ops.abs(df, name="abs_df"),
diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py
index 8049522e9f..a42350430e 100644
--- a/tensorflow/contrib/distributions/python/ops/deterministic.py
+++ b/tensorflow/contrib/distributions/python/ops/deterministic.py
@@ -87,7 +87,7 @@ class _BaseDeterministic(distribution.Distribution):
       ValueError:  If `loc` is a scalar.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, atol, rtol]):
+    with ops.name_scope(name, values=[loc, atol, rtol]) as name:
       loc = ops.convert_to_tensor(loc, name="loc")
       if is_vector and validate_args:
         msg = "Argument loc must be at least rank 1."
diff --git a/tensorflow/contrib/distributions/python/ops/geometric.py b/tensorflow/contrib/distributions/python/ops/geometric.py
index 8f190e48a7..53dd42f4c8 100644
--- a/tensorflow/contrib/distributions/python/ops/geometric.py
+++ b/tensorflow/contrib/distributions/python/ops/geometric.py
@@ -86,7 +86,7 @@ class Geometric(distribution.Distribution):
     """
 
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs]):
+    with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits, probs, validate_args=validate_args, name=name)
 
diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py
index 8d05ad6b80..2c261073ee 100644
--- a/tensorflow/contrib/distributions/python/ops/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/gumbel.py
@@ -125,7 +125,7 @@ class _Gumbel(distribution.Distribution):
       TypeError: if loc and scale are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
diff --git a/tensorflow/contrib/distributions/python/ops/half_normal.py b/tensorflow/contrib/distributions/python/ops/half_normal.py
index fc0751a6e0..d0df2befd6 100644
--- a/tensorflow/contrib/distributions/python/ops/half_normal.py
+++ b/tensorflow/contrib/distributions/python/ops/half_normal.py
@@ -106,7 +106,7 @@ class HalfNormal(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[scale]):
+    with ops.name_scope(name, values=[scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._scale = array_ops.identity(scale, name="scale")
diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py
index b1bacb91b0..fbde55ef31 100644
--- a/tensorflow/contrib/distributions/python/ops/independent.py
+++ b/tensorflow/contrib/distributions/python/ops/independent.py
@@ -119,7 +119,7 @@ class Independent(distribution_lib.Distribution):
     parameters = locals()
     name = name or "Independent" + distribution.name
     self._distribution = distribution
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       if reinterpreted_batch_ndims is None:
         reinterpreted_batch_ndims = self._get_default_reinterpreted_batch_ndims(
             distribution)
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index 51ac61dcf6..502bd4f493 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -126,7 +126,7 @@ class InverseGamma(distribution.Distribution):
       TypeError: if `concentration` and `rate` are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       with ops.control_dependencies([
           check_ops.assert_positive(concentration),
           check_ops.assert_positive(rate),
@@ -281,7 +281,7 @@ class InverseGammaWithSoftplusConcentrationRate(InverseGamma):
                allow_nan_stats=True,
                name="InverseGammaWithSoftplusConcentrationRate"):
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       super(InverseGammaWithSoftplusConcentrationRate, self).__init__(
           concentration=nn.softplus(concentration,
                                     name="softplus_concentration"),
diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
index 192dede6ff..66682b2ff5 100644
--- a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
+++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
@@ -151,10 +151,11 @@ class Kumaraswamy(transformed_distribution.TransformedDistribution):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    concentration1 = ops.convert_to_tensor(
-        concentration1, name="concentration1")
-    concentration0 = ops.convert_to_tensor(
-        concentration0, name="concentration0")
+    with ops.name_scope(name, values=[concentration1, concentration0]) as name:
+      concentration1 = ops.convert_to_tensor(
+          concentration1, name="concentration1")
+      concentration0 = ops.convert_to_tensor(
+          concentration0, name="concentration0")
     super(Kumaraswamy, self).__init__(
         distribution=uniform.Uniform(
             low=array_ops.zeros([], dtype=concentration1.dtype),
diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py
index 68e6bca5a5..c83b5bc2e3 100644
--- a/tensorflow/contrib/distributions/python/ops/logistic.py
+++ b/tensorflow/contrib/distributions/python/ops/logistic.py
@@ -120,7 +120,7 @@ class Logistic(distribution.Distribution):
       TypeError: if loc and scale are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index cef6a143fc..2ef294af2e 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -145,7 +145,7 @@ class Mixture(distribution.Distribution):
           "none of the components provide a static number of ndims")
 
     # Ensure that all batch and event ndims are consistent.
-    with ops.name_scope(name, values=[cat.logits]):
+    with ops.name_scope(name, values=[cat.logits]) as name:
       num_components = cat.event_size
       static_num_components = tensor_util.constant_value(num_components)
       if static_num_components is None:
diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
index b93bdc5ab4..0b1301e551 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
@@ -131,7 +131,7 @@ class MixtureSameFamily(distribution.Distribution):
         `components_distribution` rightmost batch shape.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       self._mixture_distribution = mixture_distribution
       self._components_distribution = components_distribution
       self._runtime_assertions = []
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag.py b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
index e862552880..e3236c2db9 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
@@ -194,7 +194,7 @@ class MultivariateNormalDiag(
       ValueError: if at most `scale_identity_multiplier` is specified.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier]):
         # No need to validate_args while making diag_scale.  The returned
@@ -225,7 +225,7 @@ class MultivariateNormalDiagWithSoftplusScale(MultivariateNormalDiag):
                allow_nan_stats=True,
                name="MultivariateNormalDiagWithSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[scale_diag]):
+    with ops.name_scope(name, values=[scale_diag]) as name:
       super(MultivariateNormalDiagWithSoftplusScale, self).__init__(
           loc=loc,
           scale_diag=nn.softplus(scale_diag),
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
index 413e88f03a..2f6a6f198c 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
@@ -218,7 +218,7 @@ class MultivariateNormalDiagPlusLowRank(
     parameters = locals()
     def _convert_to_tensor(x, name):
       return None if x is None else ops.convert_to_tensor(x, name=name)
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier, scale_perturb_factor,
           scale_perturb_diag]):
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
index 4bea99fbb7..86fcd4db54 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
@@ -159,7 +159,7 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
     parameters = locals()
 
     # Convert the covariance_matrix up to a scale_tril and call MVNTriL.
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[loc, covariance_matrix]):
         if covariance_matrix is None:
           scale_tril = None
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
index a739979289..44c92312c7 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
@@ -176,7 +176,7 @@ class MultivariateNormalLinearOperator(
     if not scale.dtype.is_floating:
       raise TypeError("`scale` parameter must have floating-point dtype.")
 
-    with ops.name_scope(name, values=[loc] + scale.graph_parents):
+    with ops.name_scope(name, values=[loc] + scale.graph_parents) as name:
       # Since expand_dims doesn't preserve constant-ness, we obtain the
       # non-dynamic value if possible.
       loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_tril.py b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
index 6c7dc4ca7a..d6f8b731cb 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_tril.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
@@ -184,7 +184,7 @@ class MultivariateNormalTriL(
       return None if x is None else ops.convert_to_tensor(x, name=name)
     if loc is None and scale_tril is None:
       raise ValueError("Must specify one or both of `loc`, `scale_tril`.")
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[loc, scale_tril]):
         loc = _convert_to_tensor(loc, name="loc")
         scale_tril = _convert_to_tensor(scale_tril, name="scale_tril")
diff --git a/tensorflow/contrib/distributions/python/ops/negative_binomial.py b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
index 3a58df80da..eeaf9c0a5e 100644
--- a/tensorflow/contrib/distributions/python/ops/negative_binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
@@ -91,7 +91,7 @@ class NegativeBinomial(distribution.Distribution):
     """
 
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, logits, probs]):
+    with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits, probs, validate_args=validate_args, name=name)
       with ops.control_dependencies(
diff --git a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
index e3e40b2e9c..305b138fdc 100644
--- a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
@@ -116,7 +116,7 @@ class OneHotCategorical(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs]):
+    with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           name=name, logits=logits, probs=probs, validate_args=validate_args,
           multidimensional=True)
diff --git a/tensorflow/contrib/distributions/python/ops/poisson.py b/tensorflow/contrib/distributions/python/ops/poisson.py
index 02e97c0a2f..a84aad6fc9 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson.py
@@ -94,7 +94,7 @@ class Poisson(distribution.Distribution):
       TypeError: if `log_rate` is not a float-type.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[rate]):
+    with ops.name_scope(name, values=[rate]) as name:
       if (rate is None) == (log_rate is None):
         raise ValueError("Must specify exactly one of `rate` and `log_rate`.")
       elif log_rate is None:
diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
index 3314181898..19c99dcee9 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
@@ -256,7 +256,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
         `dtype`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       if loc is not None:
         loc = ops.convert_to_tensor(loc, name="loc")
       if scale is not None:
diff --git a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
index 8aebb79b91..1ef7651d03 100644
--- a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
@@ -217,7 +217,7 @@ class QuantizedDistribution(distributions.Distribution):
     values = (
         list(distribution.parameters.values()) +
         [low, high])
-    with ops.name_scope(name, values=values):
+    with ops.name_scope(name, values=values) as name:
       self._dist = distribution
 
       if low is not None:
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index e454a53c62..84c8d29072 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -166,7 +166,7 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
       ValueError: If both `probs` and `logits` are passed, or if neither.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs, temperature]):
+    with ops.name_scope(name, values=[logits, probs, temperature]) as name:
       with ops.control_dependencies([check_ops.assert_positive(temperature)]
                                     if validate_args else []):
         self._temperature = array_ops.identity(temperature, name="temperature")
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index 02cf3c7992..325f41e37c 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -163,7 +163,7 @@ class ExpRelaxedOneHotCategorical(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs, temperature]):
+    with ops.name_scope(name, values=[logits, probs, temperature]) as name:
 
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           name=name, logits=logits, probs=probs, validate_args=validate_args,
diff --git a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
index cde6d85500..03828fa612 100644
--- a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
@@ -134,7 +134,8 @@ class SinhArcsinh(transformed_distribution.TransformedDistribution):
     """
     parameters = locals()
 
-    with ops.name_scope(name, values=[loc, scale, skewness, tailweight]):
+    with ops.name_scope(name,
+                        values=[loc, scale, skewness, tailweight]) as name:
       loc = ops.convert_to_tensor(loc, name="loc")
       dtype = loc.dtype
       scale = ops.convert_to_tensor(scale, name="scale", dtype=dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index da271a852d..af6ff8162b 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -396,7 +396,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       ValueError: if `not distribution.is_scalar_event`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[mix_loc, temperature]):
+    with ops.name_scope(name, values=[mix_loc, temperature]) as name:
       if not scale or len(scale) < 2:
         raise ValueError("Must specify list (or list-like object) of scale "
                          "LinearOperators, one for each component with "
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
index 526fe2d39a..e265b5d0f7 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
@@ -176,7 +176,7 @@ class VectorExponentialDiag(
       ValueError: if at most `scale_identity_multiplier` is specified.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier]):
         # No need to validate_args while making diag_scale.  The returned
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
index 9d5fd9ac41..89136d6760 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
@@ -181,7 +181,7 @@ class VectorExponentialLinearOperator(
     if not scale.dtype.is_floating:
       raise TypeError("`scale` parameter must have floating-point dtype.")
 
-    with ops.name_scope(name, values=[loc] + scale.graph_parents):
+    with ops.name_scope(name, values=[loc] + scale.graph_parents) as name:
       # Since expand_dims doesn't preserve constant-ness, we obtain the
       # non-dynamic value if possible.
       loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
diff --git a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
index 05919be124..1438ede265 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
@@ -169,7 +169,7 @@ class VectorSinhArcsinhDiag(transformed_distribution.TransformedDistribution):
         name,
         values=[
             loc, scale_diag, scale_identity_multiplier, skewness, tailweight
-        ]):
+        ]) as name:
       loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
       tailweight = 1. if tailweight is None else tailweight
       has_default_skewness = skewness is None
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 887981d64e..7e78ded9df 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -178,7 +178,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
     parameters = locals()
     graph_parents = [df, loc, scale_identity_multiplier, scale_diag,
                      scale_tril, scale_perturb_factor, scale_perturb_diag]
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=graph_parents):
         # The shape of the _VectorStudentT distribution is governed by the
         # relationship between df.batch_shape and affine.batch_shape. In
diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py
index 5a8c94dabf..91453fed5d 100644
--- a/tensorflow/contrib/distributions/python/ops/wishart.py
+++ b/tensorflow/contrib/distributions/python/ops/wishart.py
@@ -109,7 +109,7 @@ class _WishartLinearOperator(distribution.Distribution):
     """
     parameters = locals()
     self._cholesky_input_output_matrices = cholesky_input_output_matrices
-    with ops.name_scope(name) as ns:
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[df, scale_operator]):
         if not scale_operator.dtype.is_floating:
           raise TypeError(
@@ -163,7 +163,7 @@ class _WishartLinearOperator(distribution.Distribution):
         parameters=parameters,
         graph_parents=([self._df, self._dimension] +
                        self._scale_operator.graph_parents),
-        name=ns)
+        name=name)
 
   @property
   def df(self):
@@ -531,7 +531,7 @@ class WishartCholesky(_WishartLinearOperator):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[scale]):
+    with ops.name_scope(name, values=[scale]) as name:
       with ops.name_scope("init", values=[scale]):
         scale = ops.convert_to_tensor(scale)
         if validate_args:
@@ -647,7 +647,7 @@ class WishartFull(_WishartLinearOperator):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name) as ns:
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[scale]):
         scale = ops.convert_to_tensor(scale)
         if validate_args:
@@ -666,5 +666,5 @@ class WishartFull(_WishartLinearOperator):
         cholesky_input_output_matrices=cholesky_input_output_matrices,
         validate_args=validate_args,
         allow_nan_stats=allow_nan_stats,
-        name=ns)
+        name=name)
     self._parameters = parameters
diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
index 68aaf3815e..2c9f0e9a32 100644
--- a/tensorflow/python/ops/distributions/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -72,7 +72,7 @@ class Bernoulli(distribution.Distribution):
       ValueError: If p and logits are passed, or if neither are passed.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits=logits,
           probs=probs,
diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py
index 469bcadb8e..8beab99bf8 100644
--- a/tensorflow/python/ops/distributions/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -151,7 +151,7 @@ class Beta(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration1, concentration0]):
+    with ops.name_scope(name, values=[concentration1, concentration0]) as name:
       self._concentration1 = self._maybe_assert_valid_concentration(
           ops.convert_to_tensor(concentration1, name="concentration1"),
           validate_args)
@@ -323,7 +323,7 @@ class BetaWithSoftplusConcentration(Beta):
                name="BetaWithSoftplusConcentration"):
     parameters = locals()
     with ops.name_scope(name, values=[concentration1,
-                                      concentration0]) as ns:
+                                      concentration0]) as name:
       super(BetaWithSoftplusConcentration, self).__init__(
           concentration1=nn.softplus(concentration1,
                                      name="softplus_concentration1"),
@@ -331,7 +331,7 @@ class BetaWithSoftplusConcentration(Beta):
                                      name="softplus_concentration0"),
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats,
-          name=ns)
+          name=name)
     self._parameters = parameters
 
 
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index 9161e3fa9f..66fa9e110c 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -183,7 +183,7 @@ class Categorical(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs]):
+    with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits=logits,
           probs=probs,
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index 25afeec936..eafcd5c78f 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -155,7 +155,7 @@ class Dirichlet(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration]):
+    with ops.name_scope(name, values=[concentration]) as name:
       self._concentration = self._maybe_assert_valid_concentration(
           ops.convert_to_tensor(concentration, name="concentration"),
           validate_args)
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
index 03a98c56ba..fe0ed7e07d 100644
--- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -192,7 +192,7 @@ class DirichletMultinomial(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, concentration]):
+    with ops.name_scope(name, values=[total_count, concentration]) as name:
       # Broadcasting works because:
       # * The broadcasting convention is to prepend dimensions of size [1], and
       #   we use the last dimension for the distribution, whereas
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 7c43bf54fc..3815abf72d 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -434,13 +434,17 @@ class Distribution(_BaseDistribution):
     for i, t in enumerate(graph_parents):
       if t is None or not tensor_util.is_tensor(t):
         raise ValueError("Graph parent item %d is not a Tensor; %s." % (i, t))
+    if not name or name[-1] != "/":  # `name` is not a name scope
+      non_unique_name = name or type(self).__name__
+      with ops.name_scope(non_unique_name) as name:
+        pass
     self._dtype = dtype
     self._reparameterization_type = reparameterization_type
     self._allow_nan_stats = allow_nan_stats
     self._validate_args = validate_args
     self._parameters = parameters or {}
     self._graph_parents = graph_parents
-    self._name = name or type(self).__name__
+    self._name = name
 
   @classmethod
   def param_shapes(cls, sample_shape, name="DistributionParamShapes"):
diff --git a/tensorflow/python/ops/distributions/exponential.py b/tensorflow/python/ops/distributions/exponential.py
index 6345a76d48..cf0e729e1a 100644
--- a/tensorflow/python/ops/distributions/exponential.py
+++ b/tensorflow/python/ops/distributions/exponential.py
@@ -95,7 +95,7 @@ class Exponential(gamma.Gamma):
     # true in the parent class "Gamma."  Therefore, passing
     # allow_nan_stats=True
     # through to the parent class results in unnecessary asserts.
-    with ops.name_scope(name, values=[rate]):
+    with ops.name_scope(name, values=[rate]) as name:
       self._rate = ops.convert_to_tensor(rate, name="rate")
     super(Exponential, self).__init__(
         concentration=array_ops.ones([], dtype=self._rate.dtype),
@@ -144,7 +144,7 @@ class ExponentialWithSoftplusRate(Exponential):
                allow_nan_stats=True,
                name="ExponentialWithSoftplusRate"):
     parameters = locals()
-    with ops.name_scope(name, values=[rate]):
+    with ops.name_scope(name, values=[rate]) as name:
       super(ExponentialWithSoftplusRate, self).__init__(
           rate=nn.softplus(rate, name="softplus_rate"),
           validate_args=validate_args,
diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py
index adb1f4f9a8..d39f7c56d3 100644
--- a/tensorflow/python/ops/distributions/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -127,7 +127,7 @@ class Gamma(distribution.Distribution):
       TypeError: if `concentration` and `rate` are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       with ops.control_dependencies([
           check_ops.assert_positive(concentration),
           check_ops.assert_positive(rate),
@@ -262,7 +262,7 @@ class GammaWithSoftplusConcentrationRate(Gamma):
                allow_nan_stats=True,
                name="GammaWithSoftplusConcentrationRate"):
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       super(GammaWithSoftplusConcentrationRate, self).__init__(
           concentration=nn.softplus(concentration,
                                     name="softplus_concentration"),
diff --git a/tensorflow/python/ops/distributions/laplace.py b/tensorflow/python/ops/distributions/laplace.py
index e98ac855c5..3ccfc618d1 100644
--- a/tensorflow/python/ops/distributions/laplace.py
+++ b/tensorflow/python/ops/distributions/laplace.py
@@ -101,7 +101,7 @@ class Laplace(distribution.Distribution):
       TypeError: if `loc` and `scale` are of different dtype.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
@@ -218,7 +218,7 @@ class LaplaceWithSoftplusScale(Laplace):
                allow_nan_stats=True,
                name="LaplaceWithSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       super(LaplaceWithSoftplusScale, self).__init__(
           loc=loc,
           scale=nn.softplus(scale, name="softplus_scale"),
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index 4ae67a009b..ab77f5c1f8 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -183,7 +183,7 @@ class Multinomial(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, logits, probs]):
+    with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._total_count = ops.convert_to_tensor(total_count, name="total_count")
       if validate_args:
         self._total_count = (
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py
index 32e8a49c81..20d4420e91 100644
--- a/tensorflow/python/ops/distributions/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -132,7 +132,7 @@ class Normal(distribution.Distribution):
       TypeError: if `loc` and `scale` have different `dtype`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
@@ -244,7 +244,7 @@ class NormalWithSoftplusScale(Normal):
                allow_nan_stats=True,
                name="NormalWithSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[scale]):
+    with ops.name_scope(name, values=[scale]) as name:
       super(NormalWithSoftplusScale, self).__init__(
           loc=loc,
           scale=nn.softplus(scale, name="softplus_scale"),
diff --git a/tensorflow/python/ops/distributions/student_t.py b/tensorflow/python/ops/distributions/student_t.py
index 9d9e65b4e8..961b07a7bd 100644
--- a/tensorflow/python/ops/distributions/student_t.py
+++ b/tensorflow/python/ops/distributions/student_t.py
@@ -158,7 +158,7 @@ class StudentT(distribution.Distribution):
       TypeError: if loc and scale are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[df, loc, scale]):
+    with ops.name_scope(name, values=[df, loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(df)]
                                     if validate_args else []):
         self._df = array_ops.identity(df, name="df")
@@ -350,7 +350,7 @@ class StudentTWithAbsDfSoftplusScale(StudentT):
                allow_nan_stats=True,
                name="StudentTWithAbsDfSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[df, scale]):
+    with ops.name_scope(name, values=[df, scale]) as name:
       super(StudentTWithAbsDfSoftplusScale, self).__init__(
           df=math_ops.floor(math_ops.abs(df)),
           loc=loc,
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index 1ad63a8cf6..6aa6ec40d9 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -257,7 +257,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     parameters = locals()
     name = name or (("" if bijector is None else bijector.name) +
                     distribution.name)
-    with ops.name_scope(name, values=[event_shape, batch_shape]):
+    with ops.name_scope(name, values=[event_shape, batch_shape]) as name:
       # For convenience we define some handy constants.
       self._zero = constant_op.constant(0, dtype=dtypes.int32, name="zero")
       self._empty = constant_op.constant([], dtype=dtypes.int32, name="empty")
diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py
index 0891bffdd5..087797c653 100644
--- a/tensorflow/python/ops/distributions/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -103,7 +103,7 @@ class Uniform(distribution.Distribution):
       InvalidArgumentError: if `low >= high` and `validate_args=False`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[low, high]):
+    with ops.name_scope(name, values=[low, high]) as name:
       with ops.control_dependencies([
           check_ops.assert_less(
               low, high, message="uniform not defined when low >= high.")
-- 
GitLab


From 40e16d6301ee0c1334ce514350668a16d7debd9a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 15:47:12 -0700
Subject: [PATCH 105/450] Remove duplicate code.

PiperOrigin-RevId: 193430279
---
 tensorflow/contrib/autograph/impl/naming.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/contrib/autograph/impl/naming.py b/tensorflow/contrib/autograph/impl/naming.py
index 1facaa0ca0..b1d3f76be7 100644
--- a/tensorflow/contrib/autograph/impl/naming.py
+++ b/tensorflow/contrib/autograph/impl/naming.py
@@ -62,8 +62,6 @@ class Namer(object):
       n += 1
       new_name = '%s_%d' % (new_name_root, n)
 
-    if live_entity is not None:
-      self.renamed_calls[live_entity] = new_name
     self.generated_names.add(new_name)
     if live_entity is not None:
       self.renamed_calls[live_entity] = new_name
-- 
GitLab


From 695da2d928b5927c0a4f73e352a597a19886f2cb Mon Sep 17 00:00:00 2001
From: HyoukJoong Lee <hyouklee@google.com>
Date: Wed, 18 Apr 2018 15:57:53 -0700
Subject: [PATCH 106/450] Disable failing test RGBToHSVTest.testBatch

PiperOrigin-RevId: 193431888
---
 tensorflow/compiler/tests/image_ops_test.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 12791ef8ac..5b19e993ec 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -37,6 +37,10 @@ from tensorflow.python.platform import test
 class RGBToHSVTest(XLATestCase):
 
   def testBatch(self):
+    # TODO(b/78230407): Reenable the test on GPU.
+    if self.device == "XLA_GPU":
+      return
+
     # Build an arbitrary RGB image
     np.random.seed(7)
     batch_size = 5
-- 
GitLab


From e9d47fbff0d644a75c6f3dcdcb852685ef515b64 Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Wed, 18 Apr 2018 16:01:55 -0700
Subject: [PATCH 107/450] Adds dataset transformation function
 `set_stats_aggregator(..)`, which sets the given `stats_aggregator` for
 aggregating the input dataset stats.

PiperOrigin-RevId: 193432590
---
 .../kernel_tests/stats_dataset_ops_test.py    |  67 ++++-----
 .../contrib/data/python/ops/stats_ops.py      |  61 +++++---
 tensorflow/core/BUILD                         |   1 +
 .../api_def_IteratorSetStatsAggregator.pbtxt  |   4 -
 .../api_def_SetStatsAggregatorDataset.pbtxt   |   3 +
 .../api_def_IteratorSetStatsAggregator.pbtxt  |   4 -
 .../api_def_SetStatsAggregatorDataset.pbtxt   |   4 +
 .../data => framework}/stats_aggregator.h     |   6 +-
 tensorflow/core/kernels/data/BUILD            |  32 ++---
 tensorflow/core/kernels/data/iterator_ops.cc  |  32 +----
 .../data/stats_aggregator_dataset_op.cc       | 135 ++++++++++++++++++
 .../core/kernels/data/stats_aggregator_ops.cc |   2 +-
 .../core/kernels/data/stats_dataset_ops.cc    |   2 +-
 .../core/ops/compat/ops_history.v1.pbtxt      |  12 --
 tensorflow/core/ops/dataset_ops.cc            |  13 +-
 tensorflow/core/ops/ops.pbtxt                 |  12 --
 16 files changed, 240 insertions(+), 150 deletions(-)
 delete mode 100644 tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
 delete mode 100644 tensorflow/core/api_def/python_api/api_def_IteratorSetStatsAggregator.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
 rename tensorflow/core/{kernels/data => framework}/stats_aggregator.h (94%)
 create mode 100644 tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc

diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
index 07bdf92044..7acbc676ce 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -50,17 +50,17 @@ class StatsDatasetTest(test.TestCase):
     self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
 
   def testBytesProduced(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
-            stats_ops.bytes_produced_stats("bytes_produced"))
+            stats_ops.bytes_produced_stats("bytes_produced")).apply(
+                stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       expected_sum = 0.0
       for i in range(100):
         self.assertAllEqual(
@@ -76,16 +76,16 @@ class StatsDatasetTest(test.TestCase):
       self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
 
   def testLatencyStats(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       for i in range(100):
         self.assertEqual(i, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -95,16 +95,15 @@ class StatsDatasetTest(test.TestCase):
       self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
 
   def testReinitialize(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run(stats_aggregator_subscriber)
       for j in range(5):
         sess.run(iterator.initializer)
         for i in range(100):
@@ -130,17 +129,17 @@ class StatsDatasetTest(test.TestCase):
         sess.run(next_element)
 
   def testMultipleTags(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.latency_stats("record_latency_2"))
+            stats_ops.latency_stats("record_latency_2")).apply(
+                stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       for i in range(100):
         self.assertEqual(i, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -154,17 +153,17 @@ class StatsDatasetTest(test.TestCase):
           sess.run(summary_t), "record_latency_2", 100.0)
 
   def testRepeatedTags(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.latency_stats("record_latency"))
+            stats_ops.latency_stats("record_latency")).apply(
+                stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       for i in range(100):
         self.assertEqual(i, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -174,19 +173,17 @@ class StatsDatasetTest(test.TestCase):
       self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
 
   def testMultipleIteratorsSameAggregator(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator))
     iterator_0 = dataset.make_initializable_iterator()
     iterator_1 = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscribers = [stats_aggregator.subscribe(iterator_0),
-                                    stats_aggregator.subscribe(iterator_1)]
     next_element = iterator_0.get_next() + iterator_1.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator_0.initializer, iterator_1.initializer,
-                stats_aggregator_subscribers])
+      sess.run([iterator_0.initializer, iterator_1.initializer])
       for i in range(100):
         self.assertEqual(i * 2, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -195,20 +192,6 @@ class StatsDatasetTest(test.TestCase):
         sess.run(next_element)
       self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
 
-  def testMultipleStatsAggregatorsSameIteratorFail(self):
-    dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
-    iterator = dataset.make_initializable_iterator()
-    stats_aggregator_0 = stats_ops.StatsAggregator()
-    stats_aggregator_1 = stats_ops.StatsAggregator()
-
-    with self.test_session() as sess:
-      sess.run(stats_aggregator_0.subscribe(iterator))
-      # TODO(mrry): Consider making this allowable (and also allowing
-      # aggregators to unsubscribe).
-      with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(stats_aggregator_1.subscribe(iterator))
-
 
 class StatsDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
@@ -253,5 +236,9 @@ class StatsDatasetSerializationTest(
         None, num_outputs)
 
 
+# TODO(shivaniagrawal): An input pipeline that uses the transformation
+# `stats_ops.set_stats_aggregator` cannot be checkpointed, since we don't
+# support serializing StatsAggregator yet.
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py
index b5cf0fcfe9..d391720396 100644
--- a/tensorflow/contrib/data/python/ops/stats_ops.py
+++ b/tensorflow/contrib/data/python/ops/stats_ops.py
@@ -18,7 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
@@ -85,25 +84,53 @@ class StatsAggregator(object):
     """
     return gen_dataset_ops.stats_aggregator_summary(self._resource)
 
-  def subscribe(self, iterator):
-    """Returns a @{tf.Operation} to associate this aggregator with `iterator`.
 
-    Note: Each @{tf.data.Iterator} can be associated with at most one
-    `StatsAggregator`. After running the operation that this function
-    returns, all statistics recorded in the iteration of `iterator`
-    will be stored in `stats_aggregator`.
+class _SetStatsAggregatorDataset(dataset_ops.Dataset):
+  """A `Dataset` that acts as an identity, and sets given stats_aggregator."""
 
-    Args:
-      iterator: A @{tf.data.Iterator} object.
+  def __init__(self, input_dataset, stats_aggregator):
+    super(_SetStatsAggregatorDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._stats_aggregator = stats_aggregator
 
-    Returns:
-      A @{tf.Operation} that, when run, associates this aggregator with
-      `iterator`.
-    """
-    if not isinstance(iterator, iterator_ops.Iterator):
-      raise TypeError("`iterator` must be a `tf.data.Iterator` object.")
-    return gen_dataset_ops.iterator_set_stats_aggregator(
-        iterator._iterator_resource, self._resource)  # pylint: disable=protected-access
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.set_stats_aggregator_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._stats_aggregator._resource,  # pylint: disable=protected-access
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+# TODO(shivaniagrawal): Expose these methods in `tf.contrib.data`.
+def set_stats_aggregator(stats_aggregator):
+  """Set the given stats_aggregator for aggregating the input dataset stats.
+
+  Args:
+    stats_aggregator: A `StatsAggregator` object.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return _SetStatsAggregatorDataset(dataset, stats_aggregator)
+
+  return _apply_fn
 
 
 def bytes_produced_stats(tag):
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 21f929894c..54e7ab31d7 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -547,6 +547,7 @@ tf_cuda_library(
         "framework/selective_registration.h",
         "framework/session_state.h",
         "framework/shape_inference.h",
+        "framework/stats_aggregator.h",
         "framework/tensor.h",
         "framework/tensor_shape.h",
         "framework/tensor_slice.h",
diff --git a/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt b/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
deleted file mode 100644
index c6f2212cd4..0000000000
--- a/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "IteratorSetStatsAggregator"
-  summary: "Associates the given iterator with the given statistics aggregator."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
new file mode 100644
index 0000000000..77123e143b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "SetStatsAggregatorDataset"
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IteratorSetStatsAggregator.pbtxt b/tensorflow/core/api_def/python_api/api_def_IteratorSetStatsAggregator.pbtxt
deleted file mode 100644
index db51ae3873..0000000000
--- a/tensorflow/core/api_def/python_api/api_def_IteratorSetStatsAggregator.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "IteratorSetStatsAggregator"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
new file mode 100644
index 0000000000..3a8c1036ca
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SetStatsAggregatorDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/kernels/data/stats_aggregator.h b/tensorflow/core/framework/stats_aggregator.h
similarity index 94%
rename from tensorflow/core/kernels/data/stats_aggregator.h
rename to tensorflow/core/framework/stats_aggregator.h
index 076a56b0bf..a449f324e6 100644
--- a/tensorflow/core/kernels/data/stats_aggregator.h
+++ b/tensorflow/core/framework/stats_aggregator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_
+#define TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_
 
 #include <memory>
 #include <string>
@@ -81,4 +81,4 @@ class StatsAggregatorResource : public ResourceBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index e856ede44b..221724e25d 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -13,20 +13,10 @@ load(
     "tf_cc_test",
 )
 
-cc_library(
-    name = "stats_aggregator",
-    hdrs = ["stats_aggregator.h"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-    ],
-)
-
 tf_kernel_library(
     name = "stats_aggregator_ops",
     srcs = ["stats_aggregator_ops.cc"],
     deps = [
-        ":stats_aggregator",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -38,14 +28,7 @@ cc_library(
     name = "dataset",
     srcs = [],
     hdrs = ["dataset.h"],
-    deps = [
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-    ],
+    deps = ["//tensorflow/core:framework"],
 )
 
 cc_library(
@@ -360,7 +343,6 @@ tf_kernel_library(
     srcs = ["stats_dataset_ops.cc"],
     deps = [
         ":dataset",
-        ":stats_aggregator",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -368,6 +350,16 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "stats_aggregator_dataset_op",
+    srcs = ["stats_aggregator_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "random_dataset_op",
     srcs = ["random_dataset_op.cc"],
@@ -510,7 +502,6 @@ tf_kernel_library(
     srcs = ["iterator_ops.cc"],
     deps = [
         ":dataset",
-        ":stats_aggregator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -564,6 +555,7 @@ tf_kernel_library(
         ":slide_dataset_op",
         ":sparse_tensor_slice_dataset_op",
         ":sql_dataset_ops",
+        ":stats_aggregator_dataset_op",
         ":stats_aggregator_ops",
         ":stats_dataset_ops",
         ":take_dataset_op",
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 780f927a4f..4e4997d7b3 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -19,11 +19,11 @@ limitations under the License.
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/stats_aggregator.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -203,10 +203,6 @@ class IteratorResource : public ResourceBase {
     return Status::OK();
   }
 
-  void set_stats_aggregator(std::shared_ptr<StatsAggregator> stats_aggregator) {
-    mutex_lock l(mu_);
-    stats_aggregator_ = std::move(stats_aggregator);
-  }
 
   std::shared_ptr<StatsAggregator> stats_aggregator() {
     tf_shared_lock l(mu_);
@@ -1075,30 +1071,6 @@ class DeserializeIteratorOp : public OpKernel {
   }
 };
 
-class IteratorSetStatsAggregatorOp : public OpKernel {
- public:
-  explicit IteratorSetStatsAggregatorOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    IteratorResource* iterator_resource;
-    OP_REQUIRES_OK(
-        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator_resource));
-    core::ScopedUnref unref_iterator(iterator_resource);
-
-    StatsAggregatorResource* stats_aggregator_resource;
-    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
-                                       &stats_aggregator_resource));
-    core::ScopedUnref unref_stats_aggregator(stats_aggregator_resource);
-    // TODO(mrry): Consider allowing multiple StatsAggregator ops to
-    // subscribe to updates, and/or unsubscribing.
-    OP_REQUIRES(ctx, !iterator_resource->stats_aggregator(),
-                errors::FailedPrecondition(
-                    "Iterator already associated with a StatsAggregator"));
-    iterator_resource->set_stats_aggregator(
-        stats_aggregator_resource->stats_aggregator());
-  }
-};
 
 REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("MakeIterator").Device(DEVICE_CPU),
@@ -1119,8 +1091,6 @@ REGISTER_KERNEL_BUILDER(Name("SerializeIterator").Device(DEVICE_CPU),
                         SerializeIteratorOp);
 REGISTER_KERNEL_BUILDER(Name("DeserializeIterator").Device(DEVICE_CPU),
                         DeserializeIteratorOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorSetStatsAggregator").Device(DEVICE_CPU),
-                        IteratorSetStatsAggregatorOp);
 
 }  // namespace
 
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
new file mode 100644
index 0000000000..eb96b8a872
--- /dev/null
+++ b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
@@ -0,0 +1,135 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+namespace {
+
+class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit SetStatsAggregatorDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    StatsAggregatorResource* stats_aggregator_resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
+                                       &stats_aggregator_resource));
+    core::ScopedUnref unref_stats_aggregator(stats_aggregator_resource);
+
+    *output = new Dataset(ctx, input, stats_aggregator_resource);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input,
+                     StatsAggregatorResource* stats_aggregator_resource)
+        : GraphDatasetBase(ctx),
+          input_(input),
+          stats_aggregator_resource_(stats_aggregator_resource) {
+      input_->Ref();
+      stats_aggregator_resource_->Ref();
+    }
+
+    ~Dataset() override {
+      input_->Unref();
+      stats_aggregator_resource_->Unref();
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::SetStatsAggregator")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override {
+      return "SetStatsAggregatorDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented(
+          "Cannot currently serialize the `stats_aggregator` for a "
+          "SetStatsAggregatorDataset.");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        StatsAggregatorResource* stats_aggregator_resource =
+            dataset()->stats_aggregator_resource_;
+        IteratorContext::Params params;
+        params.env = ctx->env();
+        params.runner = *(ctx->runner());
+        params.stats_aggregator_getter = [stats_aggregator_resource]() {
+          return stats_aggregator_resource->stats_aggregator();
+        };
+        params.lib = ctx->lib();
+        params.function_library = ctx->function_library();
+        params.allocator_getter = ctx->allocator_getter();
+        IteratorContext set_stats_aggregator_ctx(params);
+        return input_impl_->GetNext(&set_stats_aggregator_ctx, out_tensors,
+                                    end_of_sequence);
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    StatsAggregatorResource* stats_aggregator_resource_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("SetStatsAggregatorDataset").Device(DEVICE_CPU),
+                        SetStatsAggregatorDatasetOp);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
index 17103627e0..dd37311580 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/stats_aggregator.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
 
 #include <memory>
 
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/stats_dataset_ops.cc
index 4dc1343e21..633cd85451 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/stats_dataset_ops.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/stats_aggregator.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 5bd37efac8..031932d79f 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -25657,18 +25657,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "IteratorSetStatsAggregator"
-  input_arg {
-    name: "iterator_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "stats_aggregator_handle"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
 op {
   name: "IteratorToStringHandle"
   input_arg {
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index b25abbcc67..57f871af32 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -151,6 +151,14 @@ REGISTER_OP("LatencyStatsDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("SetStatsAggregatorDataset")
+    .Input("input_dataset: variant")
+    .Input("stats_aggregator: resource")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("MapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -506,11 +514,6 @@ REGISTER_OP("StatsAggregatorHandle")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''");
 
-REGISTER_OP("IteratorSetStatsAggregator")
-    .Input("iterator_handle: resource")
-    .Input("stats_aggregator_handle: resource")
-    .SetShapeFn(shape_inference::NoOutputs);
-
 REGISTER_OP("StatsAggregatorSummary")
     .Input("iterator: resource")
     .Output("summary: string")
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index a36608ded3..4ae1c3d7e0 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -12364,18 +12364,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "IteratorSetStatsAggregator"
-  input_arg {
-    name: "iterator_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "stats_aggregator_handle"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
 op {
   name: "IteratorToStringHandle"
   input_arg {
-- 
GitLab


From fddfa9f8dcd1a922ade5362c0538ca39e99472a7 Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Wed, 18 Apr 2018 16:35:44 -0700
Subject: [PATCH 108/450] Change distribution.distribute_dataset to accept an
 input_fn instead of a dataset.

PiperOrigin-RevId: 193437651
---
 .../distribute/python/minimize_loss_test.py   | 31 +++++++++--------
 .../distribute/python/mirrored_strategy.py    |  5 +--
 .../python/mirrored_strategy_multigpu_test.py |  4 +--
 .../distribute/python/one_device_strategy.py  |  4 +--
 .../distribute/python/optimizer_v2_test.py    |  4 +--
 .../distribute/python/single_loss_example.py  | 33 ++++++++++++-------
 .../contrib/distribute/python/step_fn.py      | 14 ++++----
 tensorflow/python/BUILD                       |  1 +
 tensorflow/python/estimator/estimator.py      | 21 +++++-------
 tensorflow/python/training/distribute.py      | 21 +++++++++---
 10 files changed, 79 insertions(+), 59 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index d7fbf7f379..6c73250ded 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -54,21 +54,18 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
   def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
                        is_tpu):
     with distribution.scope():
-      model_fn, dataset, layer = minimize_loss_example(
-          optimizer_fn,
-          use_bias=True,
-          use_callable_loss=use_callable_loss)
+      model_fn, dataset_fn, layer = minimize_loss_example(
+          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
+      def tpu_dataset_fn():
+        return dataset_fn().batch(2)
       # TODO(isaprykin):  Eliminate `is_tpu`. Probably add a
       # `DistributionStrategy.create_monitor` so that each DistributionStrategy
       # could influence its training loop. That method would return an instance
       # of Monitor.  TPUMonitor would execute tpu.initialize_system() and
       # tpu.shutdown_system().
-      if is_tpu:
-        dataset = dataset.batch(2)
-
       iterator = distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
+          tpu_dataset_fn if is_tpu else dataset_fn).make_one_shot_iterator()
 
       def run_step():
         # TODO(isaprykin): Make iterator get_next() return a list of sub-
@@ -122,14 +119,14 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
     # `distribution.scope`.
     with variable_scope.variable_creator_scope(
         appending_creator), distribution.scope():
-      model_fn, dataset, layer = minimize_loss_example(
+      model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn,
           use_bias=True,
           use_callable_loss=True,
           create_optimizer_inside_model_fn=True)
 
       iterator = distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return distribution.group(
@@ -176,7 +173,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
     """Verifies that moving mean updates are reduced across towers."""
     with distribution.scope():
       num_towers = len(distribution.worker_devices)
-      model_fn, dataset, batchnorm = batchnorm_example(
+      model_fn, dataset_fn, batchnorm = batchnorm_example(
           optimizer_fn,
           batch_per_epoch=num_towers,
           momentum=momentum,
@@ -188,7 +185,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       if isinstance(distribution, mirrored_strategy.MirroredStrategy):
         distribution._prefetch_on_device = False
       iterator = distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return control_flow_ops.group(
@@ -260,11 +257,13 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         else:
           return optimizer.minimize(loss_fn())
 
-      features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
-      labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
-      dataset = dataset_ops.Dataset.zip((features, labels)).repeat()
+      def dataset_fn():
+        features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
+        labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
+        return dataset_ops.Dataset.zip((features, labels)).repeat()
+
       iterator = distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return distribution.group(
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index d5e22e8100..6efd578a77 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -140,9 +140,10 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
       g.add_to_collections(collections, result)
     return result
 
-  def distribute_dataset(self, dataset):
+  def distribute_dataset(self, dataset_fn):
     return values.PerDeviceDataset(
-        dataset, self._devices, self._prefetch_on_device)
+        self._call_dataset_fn(dataset_fn), self._devices,
+        self._prefetch_on_device)
 
   def _broadcast(self, tensor, destinations):
     # TODO(josh11b): In eager mode, use one thread per device, or async mode.
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 59cd6703b9..6c5c055070 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -247,9 +247,9 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     dist = mirrored_strategy.MirroredStrategy(
         ["/device:GPU:0", "/device:CPU:0"])
-    features = dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)
     features = dist.distribute_dataset(
-        features).make_one_shot_iterator().get_next()
+        lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)
+    ).make_one_shot_iterator().get_next()
 
     with dist.scope():
       result = dist.call_for_each_tower(
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 2002266dd5..646d2a5c3b 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -60,8 +60,8 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     with ops.colocate_with(colocate_with):
       return next_creator(*args, **kwargs)
 
-  def distribute_dataset(self, dataset):
-    return dataset
+  def distribute_dataset(self, dataset_fn):
+    return self._call_dataset_fn(dataset_fn)
 
   def _broadcast(self, tensor, destinations):
     return tensor
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
index 6e4d050073..abd3a65ac4 100644
--- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -39,11 +39,11 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
   def testTrainNetwork(self, distribution, optimizer_fn,
                        use_callable_loss=True):
     with distribution.scope():
-      model_fn, dataset, layer = minimize_loss_example(
+      model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
       iterator = distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return control_flow_ops.group(distribution.unwrap(
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
index cef5fd2f89..9e8f919c8a 100644
--- a/tensorflow/contrib/distribute/python/single_loss_example.py
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -29,7 +29,10 @@ from tensorflow.python.ops import math_ops
 
 def single_loss_example(optimizer_fn, distribution, use_bias=False):
   """Build a very simple network to use in tests and examples."""
-  dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+
+  def dataset_fn():
+    return dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+
   optimizer = optimizer_fn()
   layer = core.Dense(1, use_bias=use_bias)
 
@@ -37,8 +40,8 @@ def single_loss_example(optimizer_fn, distribution, use_bias=False):
     y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
     return y * y
 
-  single_loss_step = step_fn.StandardSingleLossStep(dataset, loss_fn, optimizer,
-                                                    distribution)
+  single_loss_step = step_fn.StandardSingleLossStep(dataset_fn, loss_fn,
+                                                    optimizer, distribution)
 
   # Layer is returned for inspecting the kernels in tests.
   return single_loss_step, layer
@@ -49,7 +52,10 @@ def minimize_loss_example(optimizer_fn,
                           use_callable_loss=True,
                           create_optimizer_inside_model_fn=False):
   """Example of non-distribution-aware legacy code."""
-  dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+
+  def dataset_fn():
+    return dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+
   # An Optimizer instance is created either outside or inside model_fn.
   outer_optimizer = None
   if not create_optimizer_inside_model_fn:
@@ -71,7 +77,7 @@ def minimize_loss_example(optimizer_fn,
     else:
       return optimizer.minimize(loss_fn())
 
-  return model_fn, dataset, layer
+  return model_fn, dataset_fn, layer
 
 
 def batchnorm_example(optimizer_fn,
@@ -79,12 +85,15 @@ def batchnorm_example(optimizer_fn,
                       momentum=0.9,
                       renorm=False):
   """Example of non-distribution-aware legacy code with batch normalization."""
-  # input shape is [16, 8], input values are increasing in both dimensions.
-  dataset = dataset_ops.Dataset.from_tensor_slices(
-      [[[float(x * 8 + y + z * 100)
-         for y in range(8)]
-        for x in range(16)]
-       for z in range(batch_per_epoch)]).repeat()
+
+  def dataset_fn():
+    # input shape is [16, 8], input values are increasing in both dimensions.
+    return dataset_ops.Dataset.from_tensor_slices(
+        [[[float(x * 8 + y + z * 100)
+           for y in range(8)]
+          for x in range(16)]
+         for z in range(batch_per_epoch)]).repeat()
+
   optimizer = optimizer_fn()
   batchnorm = normalization.BatchNormalization(
       renorm=renorm, momentum=momentum, fused=False)
@@ -99,4 +108,4 @@ def batchnorm_example(optimizer_fn,
     # Callable loss.
     return optimizer.minimize(loss_fn)
 
-  return model_fn, dataset, batchnorm
+  return model_fn, dataset_fn, batchnorm
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index 68b8f4d626..d1910622b3 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -49,13 +49,14 @@ class StandardInputStep(Step):
   """Step with a standard implementation of input handling.
 
   Args:
-    input_dataset: a tf.data Dataset that provides input.
+    dataset_fn: a function that returns a tf.data Dataset that produces the
+      input for the model.
   """
 
-  def __init__(self, input_dataset, distribution):
+  def __init__(self, dataset_fn, distribution):
     Step.__init__(self, distribution)
     self._distributed_input = distribution.distribute_dataset(
-        input_dataset).make_one_shot_iterator()
+        dataset_fn).make_one_shot_iterator()
 
   def inputs(self):
     return self._distributed_input.get_next()
@@ -77,14 +78,15 @@ class StandardSingleLossStep(StandardInputStep):
   ```
 
   Args:
-    input_dataset: a tf.data Dataset that provides input.
+    dataset_fn: a function that returns a tf.data Dataset that produces the
+      input for the model.
     loss_fn: a function that returns loss.
     optimizer: an optimizer that implements an update rule.
     distribution: a `DistributionStrategy` object.
   """
 
-  def __init__(self, input_dataset, loss_fn, optimizer, distribution):
-    StandardInputStep.__init__(self, input_dataset, distribution)
+  def __init__(self, dataset_fn, loss_fn, optimizer, distribution):
+    StandardInputStep.__init__(self, dataset_fn, distribution)
     self._loss_fn = loss_fn
     self._optimizer = optimizer
     self._is_run_concurrently = False
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index c2bedab4f9..698e2a28bf 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -3048,6 +3048,7 @@ py_library(
         ":state_ops",
         ":util",
         ":variable_scope",
+        "//tensorflow/python/data",
         "//tensorflow/python/ops/losses",
     ],
 )
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index dde463aaf4..a42b6cfee8 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -688,22 +688,19 @@ class Estimator(object):
 
   def _get_features_and_labels_from_input_fn(self, input_fn, mode):
     """Extracts the `features` and labels from return values of `input_fn`."""
-    result = self._call_input_fn(input_fn, mode)
-    # TODO(anjalisridhar): What about the default DistributionStrategy? Perhaps
-    # using any input is alright in that case. There is also a
-    # has_dataset_or_queue_runner function that we may want to extend and use.
-    if (self._distribution is not None and
-        not isinstance(result, dataset_ops.Dataset) and
-        mode == model_fn_lib.ModeKeys.TRAIN):
-      raise ValueError('input_fn() must return a tf.data.Dataset when using a '
-                       'DistributionStrategy.')
     input_hooks = []
-    if isinstance(result, dataset_ops.Dataset):
-      if self._distribution is not None and mode == model_fn_lib.ModeKeys.TRAIN:
-        result = self._distribution.distribute_dataset(result)
+    if self._distribution is not None and mode == model_fn_lib.ModeKeys.TRAIN:
+      result = self._distribution.distribute_dataset(
+          lambda: self._call_input_fn(input_fn, mode))
       iterator = result.make_initializable_iterator()
       input_hooks.append(_DatasetInitializerHook(iterator))
       result = iterator.get_next()
+    else:
+      result = self._call_input_fn(input_fn, mode)
+      if isinstance(result, dataset_ops.Dataset):
+        iterator = result.make_initializable_iterator()
+        input_hooks.append(_DatasetInitializerHook(iterator))
+        result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       if len(result) != 2:
         raise ValueError(
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index d855c4f551..21ec5292ad 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import threading
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -672,25 +673,35 @@ class DistributionStrategy(object):
     _require_distribution_strategy_scope(self)
     return variable_scope.variable_creator_scope(create_colocated_variable)
 
+  def _call_dataset_fn(self, dataset_fn):
+    result = dataset_fn()
+    if not isinstance(result, dataset_ops.Dataset):
+      raise ValueError(
+          "dataset_fn() must return a tf.data.Dataset when using a "
+          "DistributionStrategy.")
+    return result
+
   # TODO(josh11b): `PerDeviceDataset` currently only implements a few methods of
   # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
   # Extend to implement more functionality of datasets.
-  def distribute_dataset(self, dataset):
+  def distribute_dataset(self, dataset_fn):
     """Return a `dataset` split across all towers.
 
     Suitable for providing input to for `call_for_each_tower()` by creating an
     iterator:
 
     ```
+    def dataset_fn():
+      return tf.data.Dataset.from_tensors([[1.]]).repeat()
     with distribution_strategy.scope():
-      distributed_dataset = distribution_strategy.distribute_dataset(dataset)
+      distributed_dataset = distribution_strategy.distribute_dataset(dataset_fn)
       iterator = distributed_dataset.make_one_shot_iterator()
       tower_results = distribution_strategy.call_for_each_tower(
           tower_fn, iterator.get_next())
     ```
 
     Args:
-      dataset: A `tf.data.Dataset`.
+      dataset_fn: A function that returns a `tf.data.Dataset`.
 
     Returns:
       A `PerDeviceDataset` that will produce data for each tower.
@@ -1135,8 +1146,8 @@ class _DefaultDistributionStrategy(DistributionStrategy):
     _require_distribution_strategy_scope(self)
     return ops.colocate_with(colocate_with_variable)
 
-  def distribute_dataset(self, dataset):
-    return dataset
+  def distribute_dataset(self, dataset_fn):
+    return self._call_dataset_fn(dataset_fn)
 
   def _broadcast(self, tensor, destinations):
     if destinations is None:
-- 
GitLab


From 5ec3b021fd7e509a1597880ff093802de1f63d42 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Wed, 18 Apr 2018 16:48:17 -0700
Subject: [PATCH 109/450] Add tf.train.Checkpoint for reading and writing
 object-based checkpoints.

Previously exposed as tf.contrib.eager.Checkpoint / tfe.Checkpoint.

Spiffies up the documentation a bit, but otherwise just adds the export decorator.

Compatible in both directions with tf.train.Saver (object-based checkpoints can be fed to tf.train.Saver, and name-based checkpoints can be fed to tf.train.Checkpoint).

PiperOrigin-RevId: 193439442
---
 .../python/training/checkpointable_utils.py   | 189 ++++++++++++++++--
 tensorflow/python/training/saver.py           |   4 +-
 tensorflow/python/training/training.py        |   1 +
 .../golden/tensorflow.train.-checkpoint.pbtxt |  23 +++
 .../tools/api/golden/tensorflow.train.pbtxt   |   4 +
 5 files changed, 201 insertions(+), 20 deletions(-)
 create mode 100644 tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt

diff --git a/tensorflow/python/training/checkpointable_utils.py b/tensorflow/python/training/checkpointable_utils.py
index 2c4677a278..4769e15120 100644
--- a/tensorflow/python/training/checkpointable_utils.py
+++ b/tensorflow/python/training/checkpointable_utils.py
@@ -38,6 +38,7 @@ from tensorflow.python.training import checkpointable as checkpointable_lib
 from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
 _ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
@@ -822,30 +823,92 @@ class CheckpointableSaver(object):
     return load_status
 
 
+@tf_export("train.Checkpoint")
 class Checkpoint(checkpointable_lib.Checkpointable):
-  """A utility class which groups `Checkpointable` objects.
+  """Groups checkpointable objects, saving and restoring them.
 
-  Accepts arbitrary keyword arguments to its constructor and saves those values
-  with a checkpoint. Maintains a `save_counter` for numbering checkpoints.
+  `Checkpoint`'s constructor accepts keyword arguments whose values are types
+  that contain checkpointable state, such as `tf.train.Optimizer`
+  implementations, `tf.Variable`, `tf.keras.layers.Layer` implementations, or
+  `tf.keras.Model` implementations. It saves these values with a checkpoint, and
+  maintains a `save_counter` for numbering checkpoints.
 
-  Example usage:
+  Example usage when graph building:
 
   ```python
   import tensorflow as tf
-  import tensorflow.contrib.eager as tfe
   import os
 
   checkpoint_directory = "/tmp/training_checkpoints"
   checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
 
-  root = tfe.Checkpoint(optimizer=optimizer, model=model)
-  root.restore(tf.train.latest_checkpoint(checkpoint_directory))
+  checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
+  status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
+  train_op = optimizer.minimize( ... )
+  status.assert_consumed()  # Optional sanity checks.
+  with tf.Session() as session:
+    # Use the Session to restore variables, or initialize them if
+    # tf.train.latest_checkpoint returned None.
+    status.initialize_or_restore(session)
+    for _ in range(num_training_steps):
+      session.run(train_op)
+    checkpoint.save(file_prefix=checkpoint_prefix)
+  ```
+
+  Example usage with eager execution enabled:
+
+  ```python
+  import tensorflow as tf
+  import os
+
+  tf.enable_eager_execution()
+
+  checkpoint_directory = "/tmp/training_checkpoints"
+  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+  checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
+  status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
   for _ in range(num_training_steps):
-    optimizer.minimize( ... )
-  root.save(file_prefix=checkpoint_prefix)
+    optimizer.minimize( ... )  # Variables will be restored on creation.
+  status.assert_consumed()  # Optional sanity checks.
+  checkpoint.save(file_prefix=checkpoint_prefix)
+  ```
+
+  `Checkpoint.save` and `Checkpoint.restore` write and read object-based
+  checkpoints, in contrast to `tf.train.Saver` which writes and reads
+  `variable.name` based checkpoints. Object-based checkpointing saves a graph of
+  dependencies between Python objects (`Layer`s, `Optimizer`s, `Variable`s,
+  etc.) with named edges, and this graph is used to match variables when
+  restoring a checkpoint. It can be more robust to changes in the Python
+  program, and helps to support restore-on-create for variables when executing
+  eagerly. Prefer `tf.train.Checkpoint` over `tf.train.Saver` for new code.
+
+  `Checkpoint` objects have dependencies on the objects passed as keyword
+  arguments to their constructors, and each dependency is given a name that is
+  identical to the name of the keyword argument for which it was created.
+  TensorFlow classes like `Layer`s and `Optimizer`s will automatically add
+  dependencies on their variables (e.g. "kernel" and "bias" for
+  `tf.keras.layers.Dense`). Inheriting from `tf.keras.Model` makes managing
+  dependencies easy in user-defined classes, since `Model` hooks into attribute
+  assignment. For example:
+
+  ```python
+  class Regress(tf.keras.Model):
+
+    def __init__(self):
+      super(Regress, self).__init__()
+      self.input_transform = tf.keras.layers.Dense(10)
+      # ...
+
+    def call(self, inputs):
+      x = self.input_transform(inputs)
+      # ...
   ```
 
-  For more manual control over saving, use `tfe.CheckpointableSaver` directly.
+  This `Model` has a dependency named "input_transform" on its `Dense` layer,
+  which in turn depends on its variables. As a result, saving an instance of
+  `Regress` using `tf.train.Checkpoint` will also save all the variables created
+  by the `Dense` layer.
 
   Attributes:
     save_counter: Incremented when `save()` is called. Used to number
@@ -857,17 +920,19 @@ class Checkpoint(checkpointable_lib.Checkpointable):
 
     Args:
       **kwargs: Keyword arguments are set as attributes of this object, and are
-        saved with the checkpoint. Attribute values must derive from
-        `CheckpointableBase`.
+        saved with the checkpoint. Values must be checkpointable objects.
     Raises:
-      ValueError: If objects in `kwargs` are not Checkpointable.
+      ValueError: If objects in `kwargs` are not checkpointable.
     """
     super(Checkpoint, self).__init__()
     for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
       if not isinstance(v, checkpointable_lib.CheckpointableBase):
         raise ValueError(
-            ("`Checkpoint` was expecting an object derived from "
-             "`CheckpointableBase`, got %s.") % (v,))
+            ("`Checkpoint` was expecting a checkpointable object (an object "
+             "derived from `CheckpointableBase`), got %s. If you believe this "
+             "object should be checkpointable (i.e. it is part of the "
+             "TensorFlow Python API and manages state), please open an issue.")
+            % (v,))
       setattr(self, k, v)
     self._save_counter = None  # Created lazily for restore-on-create.
     self._saver = CheckpointableSaver(weakref.ref(self))
@@ -893,7 +958,23 @@ class Checkpoint(checkpointable_lib.Checkpointable):
     return self._save_counter
 
   def save(self, file_prefix, session=None):
-    """Save a checkpoint. Wraps `tfe.CheckpointableSaver.save`."""
+    """Save a training checkpoint.
+
+    The saved checkpoint includes variables created by this object and any
+    checkpointable objects it depends on at the time `Checkpoint.save()` is
+    called.
+
+    Args:
+      file_prefix: A prefix to use for the checkpoint filenames
+        (/path/to/directory/and_a_prefix). Names are generated based on this
+        prefix and `Checkpoint.save_counter`.
+      session: The session to evaluate variables in. Ignored when executing
+        eagerly. If not provided when graph building, the default session is
+        used.
+
+    Returns:
+      The full path to the checkpoint.
+    """
     in_graph_mode = not context.executing_eagerly()
     if in_graph_mode:
       if session is None:
@@ -913,7 +994,81 @@ class Checkpoint(checkpointable_lib.Checkpointable):
         session=session)
 
   def restore(self, save_path):
-    """Restore a checkpoint. Wraps `tfe.CheckpointableSaver.restore`."""
+    """Restore a training checkpoint.
+
+    Restores this `Checkpoint` and any objects it depends on.
+
+    When executing eagerly, either assigns values immediately if variables to
+    restore have been created already, or defers restoration until the variables
+    are created. Dependencies added after this call will be matched if they have
+    a corresponding object in the checkpoint (the restore request will queue in
+    any checkpointable object waiting for the expected dependency to be added).
+
+    When graph building, restoration ops are added to the graph but not run
+    immediately.
+
+    To ensure that loading is complete and no more assignments will take place,
+    use the `assert_consumed()` method of the status object returned by
+    `restore`:
+
+    ```python
+    checkpoint = tf.train.Checkpoint( ... )
+    checkpoint.restore(path).assert_consumed()
+    ```
+
+    An exception will be raised if any Python objects in the dependency graph
+    were not found in the checkpoint, or if any checkpointed values do not have
+    a matching Python object.
+
+    When graph building, `assert_consumed()` indicates that all of the restore
+    ops that will be created for this checkpoint have been created. They can be
+    run via the `run_restore_ops()` method of the status object:
+
+    ```python
+    checkpoint.restore(path).assert_consumed().run_restore_ops()
+    ```
+
+    If the checkpoint has not been consumed completely, then the list of restore
+    ops will grow as more objects are added to the dependency graph.
+
+    Name-based `tf.train.Saver` checkpoints can be loaded using this
+    method. There is no deferred loading, and names are used to match
+    variables. No restore ops are created/run until `run_restore_ops()` or
+    `initialize_or_restore()` is called on the returned status object, even
+    when executing eagerly. Re-encode name-based checkpoints using
+    `tf.train.Checkpoint.save` as soon as possible.
+
+    Args:
+      save_path: The path to the checkpoint, as returned by `save` or
+        `tf.train.latest_checkpoint`. If None (as when there is no latest
+        checkpoint for `tf.train.latest_checkpoint` to return), returns an
+        object which may run initializers for objects in the dependency
+        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
+        names are used to match variables.
+
+    Returns:
+      A load status object, which can be used to make assertions about the
+      status of a checkpoint restoration and run initialization/restore ops.
+
+      The returned status object has the following methods:
+      - `assert_consumed()`:
+          Raises an exception if any variables/objects are unmatched: either
+          checkpointed values which don't have a matching Python object or
+          Python objects in the dependency graph with no values in the
+          checkpoint. This method returns the status object, and so may be
+          chained with `initialize_or_restore` or `run_restore_ops`.
+      - `initialize_or_restore(session=None)`:
+          When graph building, runs variable initializers if `save_path` is
+          `None`, but otherwise runs restore operations. If no `session` is
+          explicitly specified, the default session is used. No effect for
+          object-based checkpoints when executing eagerly (variables are
+          initialized or restored eagerly).
+      - `run_restore_ops(session=None)`:
+          When graph building, runs restore operations. If no `session` is
+          explicitly specified, the default session is used. No effect for
+          object-based checkpoints when executing eagerly (restore operations
+          are run eagerly). May only be called when `save_path` is not `None`.
+    """
     status = self._saver.restore(save_path=save_path)
     # Create the save counter now so it gets initialized with other variables
     # when graph building. Creating it earlier would lead to double
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 79d278cf90..a74d629a8f 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -1824,12 +1824,10 @@ class Saver(object):
       # This is an object-based checkpoint. We'll print a warning and then do
       # the restore.
       logging.warning(
-          # TODO(allenl): Modify instructions for using the object-based saver
-          # once that's in core.
           "Restoring an object-based checkpoint using a name-based saver. This "
           "may be somewhat fragile, and will re-build the Saver. Instead, "
           "consider loading object-based checkpoints using "
-          "tf.contrib.eager.Checkpoint().")
+          "tf.train.Checkpoint().")
       self._restore_from_object_based_checkpoint(
           sess=sess, save_path=save_path,
           object_graph_string=object_graph_string)
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index b759b156d7..d7e5078be7 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -156,6 +156,7 @@ from tensorflow.python.training.basic_session_run_hooks import FinalOpsHook
 from tensorflow.python.training.basic_session_run_hooks import FeedFnHook
 from tensorflow.python.training.basic_session_run_hooks import ProfilerHook
 from tensorflow.python.training.basic_loops import basic_train_loop
+from tensorflow.python.training.checkpointable_utils import Checkpoint
 from tensorflow.python.training.checkpoint_utils import init_from_checkpoint
 from tensorflow.python.training.checkpoint_utils import list_variables
 from tensorflow.python.training.checkpoint_utils import load_checkpoint
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
new file mode 100644
index 0000000000..17f393d27c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.train.Checkpoint"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.checkpointable_utils.Checkpoint\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "save_counter"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "restore"
+    argspec: "args=[\'self\', \'save_path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'file_prefix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
index bec72e1e60..9fb18e77af 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "BytesList"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "Checkpoint"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CheckpointSaverHook"
     mtype: "<type \'type\'>"
-- 
GitLab


From f089ef66f6e357e4a814ad4757e46bf88cf11bb6 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 18 Apr 2018 17:04:46 -0700
Subject: [PATCH 110/450] Add a ten-second timeout to the DeleteWorkerSession
 call.

Previously, `MasterSession::Close()` did not block on the cleanup RPCs
to the individual workers, leading to deployments where the remote
workers might be shut down (e.g. by an external mechanism) before the
session was closed. In order to switch over to using
DeleteWorkerSession for all sessions, and preserve backwards
compatibility, we need to permit this behavior. Therefore, this CL
adds a 10-second timeout on the requests to workers, and logs an error
if the request does not succeed in that time period.

PiperOrigin-RevId: 193441618
---
 .../core/distributed_runtime/master_session.cc |  9 +++++++--
 .../rpc/grpc_remote_worker.cc                  |  6 ++++--
 tensorflow/core/distributed_runtime/worker.cc  |  3 ++-
 tensorflow/core/distributed_runtime/worker.h   |  3 ++-
 .../distributed_runtime/worker_interface.h     | 18 ++++++++++++++++--
 5 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 08020f0266..7868200fb4 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -1273,6 +1273,8 @@ Status MasterSession::DeleteWorkerSessions() {
     // The worker referenced by name. (Not owned.)
     WorkerInterface* worker = nullptr;
 
+    CallOptions call_opts;
+
     // Request and responses used for a given worker.
     DeleteWorkerSessionRequest request;
     DeleteWorkerSessionResponse response;
@@ -1296,6 +1298,9 @@ Status MasterSession::DeleteWorkerSessions() {
     workers[i].name = &worker_names[i];
     workers[i].worker = worker_cache->CreateWorker(worker_names[i]);
     workers[i].request.set_session_handle(handle_);
+    // Since the worker may have gone away, set a timeout to avoid blocking the
+    // session-close operation.
+    workers[i].call_opts.SetTimeout(10000);
   }
 
   for (size_t i = 0; i < worker_names.size(); ++i) {
@@ -1303,8 +1308,8 @@ Status MasterSession::DeleteWorkerSessions() {
       workers[i].status = s;
       done.DecrementCount();
     };
-    workers[i].worker->DeleteWorkerSessionAsync(&workers[i].request,
-                                                &workers[i].response, cb);
+    workers[i].worker->DeleteWorkerSessionAsync(
+        &workers[i].call_opts, &workers[i].request, &workers[i].response, cb);
   }
 
   done.Wait();
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index b3b05408b1..895bbd97b7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -72,10 +72,12 @@ class GrpcRemoteWorker : public WorkerInterface {
     IssueRequest(request, response, createworkersession_, std::move(done));
   }
 
-  void DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+  void DeleteWorkerSessionAsync(CallOptions* call_opts,
+                                const DeleteWorkerSessionRequest* request,
                                 DeleteWorkerSessionResponse* response,
                                 StatusCallback done) override {
-    IssueRequest(request, response, deleteworkersession_, std::move(done));
+    IssueRequest(request, response, deleteworkersession_, std::move(done),
+                 call_opts);
   }
 
   void RegisterGraphAsync(const RegisterGraphRequest* request,
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 6b2536c3c0..e9073ef9f6 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -49,7 +49,8 @@ void Worker::CreateWorkerSessionAsync(const CreateWorkerSessionRequest* request,
   done(s);
 }
 
-void Worker::DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+void Worker::DeleteWorkerSessionAsync(CallOptions* opts,
+                                      const DeleteWorkerSessionRequest* request,
                                       DeleteWorkerSessionResponse* response,
                                       StatusCallback done) {
   Status s = env_->session_mgr->DeleteSession(request->session_handle());
diff --git a/tensorflow/core/distributed_runtime/worker.h b/tensorflow/core/distributed_runtime/worker.h
index 62fa5f3cf5..19aeeb752c 100644
--- a/tensorflow/core/distributed_runtime/worker.h
+++ b/tensorflow/core/distributed_runtime/worker.h
@@ -52,7 +52,8 @@ class Worker : public WorkerInterface {
                                 CreateWorkerSessionResponse* response,
                                 StatusCallback done) override;
 
-  void DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+  void DeleteWorkerSessionAsync(CallOptions* opts,
+                                const DeleteWorkerSessionRequest* request,
                                 DeleteWorkerSessionResponse* response,
                                 StatusCallback done) override;
 
diff --git a/tensorflow/core/distributed_runtime/worker_interface.h b/tensorflow/core/distributed_runtime/worker_interface.h
index 4c58bf41a4..a1597ee798 100644
--- a/tensorflow/core/distributed_runtime/worker_interface.h
+++ b/tensorflow/core/distributed_runtime/worker_interface.h
@@ -45,7 +45,7 @@ class WorkerInterface {
       CreateWorkerSessionResponse* response, StatusCallback done) = 0;
 
   virtual void DeleteWorkerSessionAsync(
-      const DeleteWorkerSessionRequest* request,
+      CallOptions* opts, const DeleteWorkerSessionRequest* request,
       DeleteWorkerSessionResponse* response, StatusCallback done) = 0;
 
   virtual void RegisterGraphAsync(const RegisterGraphRequest* request,
@@ -124,7 +124,8 @@ class WorkerInterface {
 
   Status DeleteWorkerSession(const DeleteWorkerSessionRequest* request,
                              DeleteWorkerSessionResponse* response) {
-    return CallAndWait(&ME::DeleteWorkerSessionAsync, request, response);
+    return CallAndWaitWithOptions(&ME::DeleteWorkerSessionAsync, request,
+                                  response);
   }
 
   Status RegisterGraph(const RegisterGraphRequest* request,
@@ -183,6 +184,19 @@ class WorkerInterface {
     n.WaitForNotification();
     return ret;
   }
+
+  template <typename Method, typename Req, typename Resp>
+  Status CallAndWaitWithOptions(Method func, const Req* req, Resp* resp) {
+    CallOptions call_opts;
+    Status ret;
+    Notification n;
+    (this->*func)(&call_opts, req, resp, [&ret, &n](const Status& s) {
+      ret = s;
+      n.Notify();
+    });
+    n.WaitForNotification();
+    return ret;
+  }
 };
 
 }  // namespace tensorflow
-- 
GitLab


From b23415e3f3c34c3911e4e05758a41a81e5882453 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 17:05:04 -0700
Subject: [PATCH 111/450] Replace spaces in "Fraction of Zero Values" with _
 because spaces are illegal and will be auto-replaced.

PiperOrigin-RevId: 193441676
---
 tensorflow/contrib/slim/python/slim/summaries.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/slim/python/slim/summaries.py b/tensorflow/contrib/slim/python/slim/summaries.py
index 358359d6eb..a7dc3f6723 100644
--- a/tensorflow/contrib/slim/python/slim/summaries.py
+++ b/tensorflow/contrib/slim/python/slim/summaries.py
@@ -144,7 +144,7 @@ def add_zero_fraction_summary(tensor, name=None, prefix=None,
     A scalar `Tensor` of type `string` whose contents are the serialized
     `Summary` protocol buffer.
   """
-  name = _get_summary_name(tensor, name, prefix, 'Fraction of Zero Values')
+  name = _get_summary_name(tensor, name, prefix, 'Fraction_of_Zero_Values')
   tensor = nn.zero_fraction(tensor)
   return add_scalar_summary(tensor, name, print_summary=print_summary)
 
-- 
GitLab


From 8cfbbafc17c8baaad47f2a12508c3bee9c8fcda4 Mon Sep 17 00:00:00 2001
From: fo40225 <fo40225@users.noreply.github.com>
Date: Thu, 12 Apr 2018 09:41:48 +0800
Subject: [PATCH 112/450] fix tf.GIT_VERSION always 'unknown' on windows cmake
 build (#16730)

---
 .../contrib/cmake/tf_core_framework.cmake     |  2 +-
 tensorflow/tools/git/gen_git_source.py        | 37 +++++++++++++------
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 73cadc58ff..973c191c47 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -276,7 +276,7 @@ add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
 add_custom_command(OUTPUT
     ${VERSION_INFO_CC}
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    ARGS --raw_generate ${VERSION_INFO_CC} --git_tag_override=${GIT_TAG_OVERRIDE}
+    ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir} --git_tag_override=${GIT_TAG_OVERRIDE}
     DEPENDS __force_rebuild)
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 7f0f325119..2151a75e84 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -164,18 +164,14 @@ def get_git_version(git_base_path, git_tag_override):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
-    if git_tag_override and val:
+    if git_tag_override:
       split_val = val.split("-")
-      if len(split_val) < 3:
+      if len(split_val) != 3:
         raise Exception(
             ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
              "but got '%s'") % val)
-      # There might be "-" in the tag name. But we can be sure that the final
-      # two "-" are those inserted by the git describe command.
-      commits_ahead_of_tag = split_val[-2]
-      abbrev_commit = split_val[-1]
-      val = bytes(
-          "-".join([git_tag_override, commits_ahead_of_tag, abbrev_commit]))
+      split_val[0] = git_tag_override
+      val = bytes("-".join(split_val))
     return val if val else unknown_label
   except subprocess.CalledProcessError:
     return unknown_label
@@ -193,7 +189,15 @@ def write_version_info(filename, git_version):
   contents = """/*  Generated by gen_git_source.py  */
 #include <string>
 const char* tf_git_version() {return "%s";}
-const char* tf_compiler_version() {return __VERSION__;}
+const char* tf_compiler_version() {
+#ifdef _MSC_VER
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+  return "MSVC " TOSTRING(_MSC_FULL_VER);
+#else
+  return __VERSION__;
+#endif
+}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
@@ -257,7 +261,7 @@ def generate(arglist, git_tag_override=None):
   write_version_info(dest_file, git_version)
 
 
-def raw_generate(output_file, git_tag_override=None):
+def raw_generate(output_file, source_dir, git_tag_override=None):
   """Simple generator used for cmake/make build systems.
 
   This does not create any symlinks. It requires the build system
@@ -265,12 +269,13 @@ def raw_generate(output_file, git_tag_override=None):
 
   Args:
     output_file: Output filename for the version info cc
+    source_dir: Base path of the source code
     git_tag_override: Override the value for the git tag. This is useful for
       releases where we want to build the release before the git tag is
       created.
   """
 
-  git_version = get_git_version(".", git_tag_override)
+  git_version = get_git_version(source_dir, git_tag_override)
   write_version_info(output_file, git_version)
 
 
@@ -308,6 +313,11 @@ parser.add_argument(
     type=str,
     help="Generate version_info.cc (simpler version used for cmake/make)")
 
+parser.add_argument(
+    "--source_dir",
+    type=str,
+    help="Base path of the source code (used for cmake/make)")
+
 args = parser.parse_args()
 
 if args.configure is not None:
@@ -317,7 +327,10 @@ if args.configure is not None:
 elif args.generate is not None:
   generate(args.generate, args.git_tag_override)
 elif args.raw_generate is not None:
-  raw_generate(args.raw_generate, args.git_tag_override)
+  source_path = "."
+  if args.source_dir is not None:
+    source_path = args.source_dir
+  raw_generate(args.raw_generate, source_path, args.git_tag_override)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
                      "must be used")
-- 
GitLab


From d961d8ffae1500aca0c6191e4b1e37a2a44bf527 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 17:09:06 -0700
Subject: [PATCH 113/450] Fix reference name.

PiperOrigin-RevId: 193442269
---
 tensorflow/contrib/autograph/pyct/static_analysis/type_info.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
index 763997968c..c00946f9c4 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
@@ -199,8 +199,7 @@ class TypeInfoResolver(transformer.Base):
         target_symbol = anno.getanno(target, anno.Basic.QN)
         self.scope.setval(target_symbol, source)
       else:
-        raise ValueError(
-            'assignment target has unknown type: %s' % target_item)
+        raise ValueError('assignment target has unknown type: %s' % target)
 
   def visit_With(self, node):
     for wi in node.items:
-- 
GitLab


From 8d48dabb309dfc4ad1e06286b6e77c7258802e56 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 17:18:52 -0700
Subject: [PATCH 114/450] Update ops-related pbtxt files.

PiperOrigin-RevId: 193443417
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 28 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 28 +++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 031932d79f..d741e2ad46 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -55051,6 +55051,34 @@ op {
     }
   }
 }
+op {
+  name: "SetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Shape"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 4ae1c3d7e0..beda05fdf2 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -25720,6 +25720,34 @@ op {
     }
   }
 }
+op {
+  name: "SetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Shape"
   input_arg {
-- 
GitLab


From 558b3d35f080163b4f8cf8b4997d9e2cc0c4fd6e Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Wed, 18 Apr 2018 17:42:42 -0700
Subject: [PATCH 115/450] Fix merge.

---
 tensorflow/tools/git/gen_git_source.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 2151a75e84..6ec162e4a9 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -164,14 +164,17 @@ def get_git_version(git_base_path, git_tag_override):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
-    if git_tag_override:
+    if git_tag_override and val:
       split_val = val.split("-")
-      if len(split_val) != 3:
+      if len(split_val) < 3:
         raise Exception(
             ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
              "but got '%s'") % val)
-      split_val[0] = git_tag_override
-      val = bytes("-".join(split_val))
+      # There might be "-" in the tag name. But we can be sure that the final
+      # two "-" are those inserted by the git describe command.
+      abbrev_commit = split_val[-1]
+      val = bytes(
+          "-".join([git_tag_override, "0", abbrev_commit]))
     return val if val else unknown_label
   except subprocess.CalledProcessError:
     return unknown_label
@@ -189,15 +192,7 @@ def write_version_info(filename, git_version):
   contents = """/*  Generated by gen_git_source.py  */
 #include <string>
 const char* tf_git_version() {return "%s";}
-const char* tf_compiler_version() {
-#ifdef _MSC_VER
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-  return "MSVC " TOSTRING(_MSC_FULL_VER);
-#else
-  return __VERSION__;
-#endif
-}
+const char* tf_compiler_version() {return __VERSION__;}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
@@ -333,4 +328,4 @@ elif args.raw_generate is not None:
   raw_generate(args.raw_generate, source_path, args.git_tag_override)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
-                     "must be used")
+                     "must be used")
\ No newline at end of file
-- 
GitLab


From dc0f44a98284e1bd8f9d44ef7a8122b27f9f0f15 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 17:46:46 -0700
Subject: [PATCH 116/450] Go: Update generated wrapper functions for TensorFlow
 ops.

PiperOrigin-RevId: 193446519
---
 tensorflow/go/op/wrappers.go | 130 +++++++++++++++--------------------
 1 file changed, 57 insertions(+), 73 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 1d4b1399ed..a5b293ce75 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -7564,22 +7564,6 @@ func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, li
 	return scope.AddOperation(opspec)
 }
 
-// Associates the given iterator with the given statistics aggregator.
-//
-// Returns the created operation.
-func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_aggregator_handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IteratorSetStatsAggregator",
-		Input: []tf.Input{
-			iterator_handle, stats_aggregator_handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
 type DataFormatVecPermuteAttr func(optionalAttr)
 
@@ -24288,6 +24272,63 @@ func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, o
 	return op.Output(0)
 }
 
+// MapSizeAttr is an optional argument to MapSize.
+type MapSizeAttr func(optionalAttr)
+
+// MapSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeCapacity(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeMemoryLimit(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapSizeContainer(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapSizeSharedName(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Convert JSON-encoded Example records to binary protocol buffer strings.
 //
 // This op translates a tensor containing Example records, encoded using
@@ -28128,63 +28169,6 @@ func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.Data
 	return values
 }
 
-// MapSizeAttr is an optional argument to MapSize.
-type MapSizeAttr func(optionalAttr)
-
-// MapSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapSizeCapacity(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapSizeMemoryLimit(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapSizeContainer(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapSizeSharedName(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of elements in the underlying container.
-func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapSize",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
 type MapIncompleteSizeAttr func(optionalAttr)
 
-- 
GitLab


From d4976f754009d084514f4308d3bfc7dc3a106e29 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Wed, 18 Apr 2018 17:48:49 -0700
Subject: [PATCH 117/450] Enable for all gpus.

PiperOrigin-RevId: 193446717
---
 tensorflow/core/grappler/optimizers/layout_optimizer.cc | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 8fb30d116d..db83580c1c 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -2132,14 +2132,7 @@ int GetNumGPUs(const Cluster& cluster) {
   int num_gpus = 0;
   for (const auto& device : devices) {
     if (device.second.type() == "GPU") {
-      if (device.second.environment().find("architecture") !=
-          device.second.environment().end()) {
-        const string arch = device.second.environment().at("architecture");
-        // TODO(yaozhang): Enable for Volta GPUs (compute capability version 7).
-        if (arch < "7") {
-          num_gpus++;
-        }
-      }
+      num_gpus++;
     }
   }
   return num_gpus;
-- 
GitLab


From f1fb08bbb70047af0c86cc440ccc0581e64fd85f Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 18 Apr 2018 18:04:44 -0700
Subject: [PATCH 118/450] Various lint fixes to TensorFlow detected after
 GitHub merge.

PiperOrigin-RevId: 193448139
---
 .../contrib/data/python/ops/resampling.py     |  1 -
 .../contrib/layers/python/layers/layers.py    | 10 ++++++-
 .../kernel_tests/attention_wrapper_test.py    | 12 ++++-----
 .../core/kernels/mkl_input_conversion_op.cc   | 12 ++++++---
 tensorflow/java/src/gen/cc/source_writer.h    |  2 +-
 tensorflow/python/ops/control_flow_ops.py     | 10 +++----
 tensorflow/python/ops/data_flow_ops.py        | 27 ++++++++++---------
 tensorflow/python/training/session_manager.py |  2 --
 tensorflow/tools/pip_package/setup.py         | 10 ++++---
 9 files changed, 50 insertions(+), 36 deletions(-)

diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py
index b465397437..a182dddd38 100644
--- a/tensorflow/contrib/data/python/ops/resampling.py
+++ b/tensorflow/contrib/data/python/ops/resampling.py
@@ -110,7 +110,6 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
         .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
     return filtered_ds.map(lambda class_value, _, data: (class_value, data))
 
-
   return _apply_fn
 
 
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 10d7f6d076..25c3b1e7ea 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -1404,6 +1404,7 @@ def convolution3d_transpose(
 @add_arg_scope
 def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
   """Converts a dense tensor into a sparse tensor.
+
   An example use would be to convert dense labels to sparse ones
   so that they can be fed to the ctc_loss.
 
@@ -2191,11 +2192,16 @@ def images_to_sequence(inputs,
                        outputs_collections=None,
                        scope=None):
   """Convert a batch of images into a batch of sequences.
+
   Args:
     inputs: a (num_images, height, width, depth) tensor
     data_format: A string. `NHWC` (default) and `NCHW` are supported.
     outputs_collections: The collections to which the outputs are added.
     scope: Optional scope for name_scope.
+
+  Raises:
+     ValueError: If `data_format` is not either NCHW or NHWC.
+
   Returns:
     (width, num_images*height, depth) sequence tensor
   """
@@ -2701,6 +2707,7 @@ def sequence_to_images(inputs,
                        outputs_collections=None,
                        scope=None):
   """Convert a batch of sequences into a batch of images.
+
   Args:
     inputs: (num_steps, num_batches, depth) sequence tensor
     height: the height of the images
@@ -2708,6 +2715,7 @@ def sequence_to_images(inputs,
       Currently supports `'channels_first'` and `'channels_last'`.
     outputs_collections: The collections to which the outputs are added.
     scope: Optional scope for name_scope.
+
   Returns:
     A tensor representing the output of the operation.
   """
@@ -2717,7 +2725,7 @@ def sequence_to_images(inputs,
     if num_batches is None:
       num_batches = -1
     else:
-      num_batches = num_batches // height
+      num_batches //= height
     reshaped = array_ops.reshape(inputs,
                                  [width, num_batches, height, depth])
     if output_data_format == 'channels_first':
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index d508cf3f9d..0232103c41 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -355,11 +355,11 @@ class AttentionWrapperTest(test.TestCase):
 
   def testLuongScaledDType(self):
     # Test case for GitHub issue 18099
-    for dtype in [np.float16, np.float32, np.float64]:
+    for dt in [np.float16, np.float32, np.float64]:
       num_units = 128
-      encoder_outputs = array_ops.placeholder(dtype, shape=[64, None, 256])
+      encoder_outputs = array_ops.placeholder(dt, shape=[64, None, 256])
       encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
-      decoder_inputs = array_ops.placeholder(dtype, shape=[64, None, 128])
+      decoder_inputs = array_ops.placeholder(dt, shape=[64, None, 128])
       decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
       batch_size = 64
       attention_mechanism = wrapper.LuongAttention(
@@ -367,7 +367,7 @@ class AttentionWrapperTest(test.TestCase):
           memory=encoder_outputs,
           memory_sequence_length=encoder_sequence_length,
           scale=True,
-          dtype=dtype,
+          dtype=dt,
       )
       cell = rnn_cell.LSTMCell(num_units)
       cell = wrapper.AttentionWrapper(cell, attention_mechanism)
@@ -378,12 +378,12 @@ class AttentionWrapperTest(test.TestCase):
           cell=cell,
           helper=helper,
           initial_state=cell.zero_state(
-              dtype=dtype, batch_size=batch_size))
+              dtype=dt, batch_size=batch_size))
 
       final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
       self.assertTrue(
           isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
-      self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+      self.assertEqual(final_outputs.rnn_output.dtype, dt)
       self.assertTrue(
           isinstance(final_state, wrapper.AttentionWrapperState))
       self.assertTrue(
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 68d3e1c9ab..dcf6bb9f74 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -291,7 +291,8 @@ class MklInputConversionOp : public OpKernel {
     // If both inputs are in MKL format
     if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
       // It is safer to compare the original TensorFlow shapes than to compare
-      // Mkl shapes since element wise ops are forwarded to Eigen implementation.
+      // Mkl shapes since element wise ops are forwarded to Eigen
+      // implementation.
       TensorShape tf_shape0 = input_shape_0.GetTfShape();
       TensorShape tf_shape1 = input_shape_1.GetTfShape();
       if (tf_shape0 == tf_shape1) {
@@ -362,9 +363,11 @@ class MklInputConversionOp : public OpKernel {
               << "converted MKL inputs to TF format";
 
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, kInputIndex_0);
+                                           op_data_type, has_avx512f_,
+                                           kInputIndex_0);
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, kInputIndex_1);
+                                           op_data_type, has_avx512f_,
+                                           kInputIndex_1);
       SetDummyMklShapeOutput(context, kInputIndex_0);
       SetDummyMklShapeOutput(context, kInputIndex_1);
       return;
@@ -464,7 +467,8 @@ class MklInputConversionOp : public OpKernel {
     }
 
     VLOG(1) << "MklInputConversionOp: Shapes (output): "
-            << context->mutable_output(kInputIndex_0)->shape().DebugString() << " and "
+            << context->mutable_output(kInputIndex_0)->shape().DebugString()
+            << " and "
             << context->mutable_output(kInputIndex_1)->shape().DebugString();
 
     VLOG(1) << "MklInputConversion completed successfully.";
diff --git a/tensorflow/java/src/gen/cc/source_writer.h b/tensorflow/java/src/gen/cc/source_writer.h
index 637072c0df..f011acd30a 100644
--- a/tensorflow/java/src/gen/cc/source_writer.h
+++ b/tensorflow/java/src/gen/cc/source_writer.h
@@ -61,7 +61,7 @@ class SourceWriter {
   // The data might potentially contain newline characters, therefore it will
   // be scanned to ensure that each line is indented and prefixed properly,
   // making it a bit slower than Append().
-  SourceWriter& Write(const StringPiece& text);
+  SourceWriter& Write(const StringPiece& str);
 
   // Writes a source code snippet read from a file.
   //
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index c43bbd4a1e..a1bfe450c8 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -609,13 +609,13 @@ def _EnforceShapeInvariant(merge_var, next_var):
   """Check if the shapes of the loops variables are invariants.
 
   Args:
-    merge_vars: The list of tensors representing the initial values of the
+    merge_var: The list of tensors representing the initial values of the
       loop variables.
-    next_vars: The list of tensors representing the values of the loop
+    next_var: The list of tensors representing the values of the loop
       variables after one loop iteration.
 
   Raises:
-    ValueError: If any tensor in `merge_vars` has a more specific shape than
+    ValueError: If any tensor in `merge_var` has a more specific shape than
       its correspnding tensor in `next_var`.
   """
   if isinstance(merge_var, ops.Tensor):
@@ -833,7 +833,7 @@ class GradLoopState(object):
     if outer_grad_state:
       outer_forward_ctxt = outer_grad_state.forward_context
     else:
-      if not hasattr(forward_ctxt, 'outer_context'):
+      if not hasattr(forward_ctxt, "outer_context"):
         raise ValueError("Failed to call gradients on a while loop without"
                          "properly serializing graph via MetaGraphDef")
       outer_forward_ctxt = forward_ctxt.outer_context
@@ -2973,7 +2973,7 @@ class WhileContext(ControlFlowContext):
     packed_exit_vars = nest.pack_sequence_as(
         structure=original_body_result,
         flat_sequence=exit_vars_with_tensor_arrays)
-    return (packed_exit_vars[0] if len(exit_vars) == 1 else packed_exit_vars)
+    return packed_exit_vars[0] if len(exit_vars) == 1 else packed_exit_vars
 
   def _FixControlInputsAndContext(self, enters):
     graph = ops.get_default_graph()
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index cb725199a8..62c5adc385 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -571,7 +571,7 @@ class QueueBase(object):
           name=name)
 
   def is_closed(self, name=None):
-    """ Returns true if queue is closed.
+    """Returns true if queue is closed.
 
     This operation returns true if the queue is closed and false if the queue
     is open.
@@ -1563,7 +1563,7 @@ class BaseStagingArea(object):
     of the staging area.
 
     Args:
-      vals: A tensor, a list or tuple of tensors, or a dictionary..
+      vals: A tensor, a list or tuple of tensors, or a dictionary.
 
     Returns:
       A (tensors, indices) tuple where `tensors` is a list of `Tensor` objects
@@ -1582,7 +1582,7 @@ class BaseStagingArea(object):
                          (sorted(vals.keys()), sorted(self._names)))
       # The order of values in `self._names` indicates the order in which the
       # tensors in the dictionary `vals` must be listed.
-      vals, indices, n = zip(*[(vals[k], i, k)
+      vals, indices, _ = zip(*[(vals[k], i, k)
                                for i, k in enumerate(self._names)
                                if k in vals])
     else:
@@ -1612,7 +1612,7 @@ class BaseStagingArea(object):
     for val, i in zip(vals, indices):
       dtype, shape = self._dtypes[i], self._shapes[i]
       # Check dtype
-      if not val.dtype == dtype:
+      if val.dtype != dtype:
         raise ValueError("Datatypes do not match. '%s' != '%s'" %
                          (str(val.dtype), str(dtype)))
 
@@ -1626,7 +1626,7 @@ class BaseStagingArea(object):
 
   def _create_device_transfers(self, tensors):
     """Encode inter-device transfers if the current device
-    is not the same as the Staging Area's device
+    is not the same as the Staging Area's device.
     """
 
     if not isinstance(tensors, (tuple, list)):
@@ -1739,11 +1739,6 @@ class StagingArea(BaseStagingArea):
     Args:
       dtypes:  A list of types.  The length of dtypes must equal the number
         of tensors in each element.
-      capacity: (Optional.) Maximum number of elements.
-        An integer. If zero, the Staging Area is unbounded
-      memory_limit: (Optional.) Maximum number of bytes of all tensors
-        in the Staging Area.
-        An integer. If zero, the Staging Area is unbounded
       shapes: (Optional.) Constraints on the shapes of tensors in an element.
         A list of shape tuples or None. This list is the same length
         as dtypes.  If the shape of any tensors in the element are constrained,
@@ -1754,6 +1749,11 @@ class StagingArea(BaseStagingArea):
       shared_name: (Optional.) A name to be used for the shared object. By
         passing the same name to two different python objects they will share
         the underlying staging area. Must be a string.
+      capacity: (Optional.) Maximum number of elements.
+        An integer. If zero, the Staging Area is unbounded.
+      memory_limit: (Optional.) Maximum number of bytes of all tensors
+        in the Staging Area.
+        An integer. If zero, the Staging Area is unbounded.
 
     Raises:
       ValueError: If one of the arguments is invalid.
@@ -1782,7 +1782,7 @@ class StagingArea(BaseStagingArea):
     """
     with ops.name_scope(name, "%s_put" % self._name,
                         self._scope_vals(values)) as scope:
-      
+
       if not isinstance(values, (list, tuple, dict)):
         values = [values]
 
@@ -1911,7 +1911,8 @@ class StagingArea(BaseStagingArea):
 
 
 class MapStagingArea(BaseStagingArea):
-  """A `MapStagingArea` is a TensorFlow data structure that stores tensors across multiple steps, and exposes operations that can put and get tensors.
+  """A `MapStagingArea` is a TensorFlow data structure that stores tensors
+  across multiple steps, and exposes operations that can put and get tensors.
 
   Each `MapStagingArea` element is a (key, value) pair.
   Only int64 keys are supported, other types should be
@@ -2375,7 +2376,7 @@ class RecordInput(object):
       return records
     else:
       with ops.name_scope(self._name):
-        batch_list = [[] for i in six.moves.range(self._batches)]
+        batch_list = [[] for _ in six.moves.range(self._batches)]
         records = array_ops.split(records, self._batch_size, 0)
         records = [array_ops.reshape(record, []) for record in records]
         for index, protobuf in zip(six.moves.range(len(records)), records):
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index a00ceb9021..3cb3877cc2 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -263,8 +263,6 @@ class SessionManager(object):
 
     Raises:
       RuntimeError: If the model cannot be initialized or recovered.
-
-    Raises:
       ValueError: If both checkpoint_dir and checkpoint_filename_with_path are
         set.
     """
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 6511a50b3b..211f93296b 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -22,7 +22,9 @@ import os
 import re
 import sys
 
-from setuptools import find_packages, setup, Command
+from setuptools import Command
+from setuptools import find_packages
+from setuptools import setup
 from setuptools.command.install import install as InstallCommandBase
 from setuptools.dist import Distribution
 
@@ -97,7 +99,9 @@ TEST_PACKAGES = [
     'scipy >= 0.15.1',
 ]
 
+
 class BinaryDistribution(Distribution):
+
   def has_ext_modules(self):
     return True
 
@@ -179,9 +183,9 @@ class InstallHeaders(Command):
 
 def find_files(pattern, root):
   """Return all the files matching pattern below root dir."""
-  for path, _, files in os.walk(root):
+  for dirpath, _, files in os.walk(root):
     for filename in fnmatch.filter(files, pattern):
-      yield os.path.join(path, filename)
+      yield os.path.join(dirpath, filename)
 
 
 matches = ['../' + x for x in find_files('*', 'external') if '.py' not in x]
-- 
GitLab


From a699d69c621fde118d4c89ba94658a9d7f91faac Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Wed, 18 Apr 2018 18:49:02 -0700
Subject: [PATCH 119/450] [TF TensorLists] Add TensorListConcatLists

TensorListConcatLists concatenates two TensorLists' entries (supports
non-scalar Tensors containing TensorLists).

PiperOrigin-RevId: 193451787
---
 .../api_def_TensorListConcatLists.pbtxt       |  3 +
 tensorflow/core/kernels/list_kernels.cc       | 93 +++++++++++++++++++
 tensorflow/core/ops/list_ops.cc               | 41 ++++++++
 .../python/kernel_tests/list_ops_test.py      | 60 ++++++++++++
 tensorflow/python/ops/list_ops.py             |  4 +
 5 files changed, 201 insertions(+)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_TensorListConcatLists.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListConcatLists.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListConcatLists.pbtxt
new file mode 100644
index 0000000000..3fa6265e10
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListConcatLists.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListConcatLists"
+}
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index d1e481d7cc..84fa63fc00 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -475,6 +475,99 @@ REGISTER_KERNEL_BUILDER(
 
 #endif  // GOOGLE_CUDA
 
+class TensorListConcatLists : public OpKernel {
+ public:
+  explicit TensorListConcatLists(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    const TensorShape& tl_a_shape = c->input(0).shape();
+    const TensorShape& tl_b_shape = c->input(1).shape();
+    OP_REQUIRES(
+        c, tl_a_shape == tl_b_shape,
+        errors::InvalidArgument("Incompatible input TensorList tensor shapes: ",
+                                tl_a_shape.DebugString(), " vs. ",
+                                tl_b_shape.DebugString()));
+    AllocatorAttributes attr;
+    std::unique_ptr<Tensor> tl_alias = c->forward_input(
+        0 /*input_index*/, 0 /*output_index*/, DT_VARIANT, tl_a_shape,
+        DEVICE_MEMORY /* input is always on DEVICE_MEMORY */, attr);
+
+    // tl_a may be aliased by tl_alias.
+    const Tensor& tl_a = c->input(0);
+    const Tensor& tl_b = c->input(1);
+
+    Tensor* output;
+    if (tl_alias) {
+      c->set_output(0, *tl_alias);
+      output = tl_alias.get();
+    } else {
+      attr.set_on_host(true);
+      OP_REQUIRES_OK(c, c->allocate_output(0, tl_a_shape, &output, attr));
+    }
+
+    auto output_t = output->flat<Variant>();
+    auto tl_a_t = tl_a.flat<Variant>();
+    auto tl_b_t = tl_b.flat<Variant>();
+
+    for (int64 b = 0; b < tl_a.NumElements(); ++b) {
+      const TensorList* l_a = tl_a_t(b).get<TensorList>();
+      const TensorList* l_b = tl_b_t(b).get<TensorList>();
+      OP_REQUIRES(
+          c, l_a != nullptr,
+          errors::InvalidArgument("input_a is not a TensorList at index ", b,
+                                  ".  Saw: '", tl_a_t(b).DebugString(), "'"));
+      OP_REQUIRES(
+          c, l_b != nullptr,
+          errors::InvalidArgument("input_b is not a TensorList at index ", b,
+                                  ".  Saw: '", tl_b_t(b).DebugString(), "'"));
+      OP_REQUIRES(c, l_a->element_dtype == element_dtype_,
+                  errors::InvalidArgument(
+                      "input_a[", b, "].dtype != element_dtype.  Saw: ",
+                      DataTypeString(l_a->element_dtype), " vs. ",
+                      DataTypeString(element_dtype_)));
+      OP_REQUIRES(c, l_b->element_dtype == element_dtype_,
+                  errors::InvalidArgument(
+                      "input_b[", b, "].dtype != element_dtype.  Saw: ",
+                      DataTypeString(l_b->element_dtype), " vs. ",
+                      DataTypeString(element_dtype_)));
+      OP_REQUIRES(c, l_a->element_shape.IsIdenticalTo(l_b->element_shape),
+                  errors::InvalidArgument(
+                      "input_a and input_b TensorList element shapes are not "
+                      "identical at index ",
+                      b, ".  Saw ", l_a->element_shape.DebugString(), " vs. ",
+                      l_b->element_shape.DebugString()));
+      if (tl_alias) {
+        TensorList* out = output_t(b).get<TensorList>();
+        DCHECK(out != nullptr) << "Expected output to alias input_a, but it "
+                                  "doesn't contain a TensorList at index "
+                               << b;
+        std::copy(l_b->tensors.begin(), l_b->tensors.end(),
+                  std::back_inserter(out->tensors));
+      } else {
+        TensorList out = *l_a;
+        std::copy(l_b->tensors.begin(), l_b->tensors.end(),
+                  std::back_inserter(out.tensors));
+        output_t(b) = std::move(out);
+      }
+    }
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorListConcatLists").Device(DEVICE_CPU),
+                        TensorListConcatLists);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(Name("TensorListConcatLists").Device(DEVICE_GPU),
+                        TensorListConcatLists);
+
+#endif  // GOOGLE_CUDA
+
 #define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(T)               \
   REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
                               .TypeConstraint<T>("element_dtype") \
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index 7af70110b7..b9f94ba1c5 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -295,5 +295,46 @@ REGISTER_OP("TensorListSetItem")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListConcatLists")
+    .Input("input_a: variant")
+    .Input("input_b: variant")
+    .Attr("element_dtype: type")
+    .Output("output: variant")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      auto input_a = c->input(0);
+      auto input_b = c->input(1);
+      TF_RETURN_IF_ERROR(c->Merge(input_a, input_b, &input_a));
+      c->set_output(0, input_a);
+
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+
+      auto* handle_data_a = c->input_handle_shapes_and_types(0);
+      auto* handle_data_b = c->input_handle_shapes_and_types(1);
+      if (handle_data_a == nullptr && handle_data_b == nullptr) {
+        c->set_output_handle_shapes_and_types(0, {{c->UnknownShape(), t}});
+        return Status::OK();
+      }
+      shape_inference::ShapeAndType list_shape_type_a =
+          (handle_data_a) ? handle_data_a->at(0) : handle_data_b->at(0);
+      const shape_inference::ShapeAndType& list_shape_type_b =
+          (handle_data_b) ? handle_data_b->at(0) : handle_data_a->at(0);
+      if (list_shape_type_a.dtype != t) {
+        return errors::InvalidArgument("input_a.type != element_dtype: ",
+                                       DataTypeString(list_shape_type_a.dtype),
+                                       " vs. ", DataTypeString(t));
+      }
+      if (list_shape_type_b.dtype != t) {
+        return errors::InvalidArgument("input_b.type != element_dtype: ",
+                                       DataTypeString(list_shape_type_b.dtype),
+                                       " vs. ", DataTypeString(t));
+      }
+      TF_RETURN_IF_ERROR(c->Merge(list_shape_type_a.shape,
+                                  list_shape_type_b.shape,
+                                  &list_shape_type_a.shape));
+      c->set_output_handle_shapes_and_types(0, {list_shape_type_a});
+      return Status::OK();
+    });
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 2084599760..098f9724a2 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -318,6 +318,66 @@ class ListOpsTest(test_util.TensorFlowTestCase):
                 [[1.0, 2.0]] * 4)
     self.assertAllEqual(self.evaluate(updated_v_stacked), expected)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConcat(self):
+    c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l_batch_0 = array_ops.stack([l0, l1])
+    l_batch_1 = array_ops.stack([l1, l0])
+
+    l_concat_01 = list_ops.tensor_list_concat_lists(
+        l_batch_0, l_batch_1, element_dtype=dtypes.float32)
+    l_concat_10 = list_ops.tensor_list_concat_lists(
+        l_batch_1, l_batch_0, element_dtype=dtypes.float32)
+    l_concat_00 = list_ops.tensor_list_concat_lists(
+        l_batch_0, l_batch_0, element_dtype=dtypes.float32)
+    l_concat_11 = list_ops.tensor_list_concat_lists(
+        l_batch_1, l_batch_1, element_dtype=dtypes.float32)
+
+    expected_00 = [[1.0, 2.0, 1.0, 2.0], [-1.0, -1.0]]
+    expected_01 = [[1.0, 2.0, -1.0], [-1.0, 1.0, 2.0]]
+    expected_10 = [[-1.0, 1.0, 2.0], [1.0, 2.0, -1.0]]
+    expected_11 = [[-1.0, -1.0], [1.0, 2.0, 1.0, 2.0]]
+
+    for i, (concat, expected) in enumerate(zip(
+        [l_concat_00, l_concat_01, l_concat_10, l_concat_11],
+        [expected_00, expected_01, expected_10, expected_11])):
+      splitted = array_ops.unstack(concat)
+      splitted_stacked_ret = self.evaluate(
+          (list_ops.tensor_list_stack(splitted[0], dtypes.float32),
+           list_ops.tensor_list_stack(splitted[1], dtypes.float32)))
+      print("Test concat %d: %s, %s, %s, %s"
+            % (i, expected[0], splitted_stacked_ret[0],
+               expected[1], splitted_stacked_ret[1]))
+      self.assertAllClose(expected[0], splitted_stacked_ret[0])
+      self.assertAllClose(expected[1], splitted_stacked_ret[1])
+
+    # Concatenating mismatched shapes fails.
+    with self.assertRaises((errors.InvalidArgumentError, ValueError)):
+      self.evaluate(
+          list_ops.tensor_list_concat_lists(
+              l_batch_0,
+              list_ops.empty_tensor_list(scalar_shape(), dtypes.float32),
+              element_dtype=dtypes.float32))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "element shapes are not identical at index 0"):
+      l_batch_of_vec_tls = array_ops.stack(
+          [list_ops.tensor_list_from_tensor([[1.0]], element_shape=[1])] * 2)
+      self.evaluate(
+          list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_vec_tls,
+                                            element_dtype=dtypes.float32))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r"input_b\[0\].dtype != element_dtype."):
+      l_batch_of_int_tls = array_ops.stack(
+          [list_ops.tensor_list_from_tensor([1], element_shape=scalar_shape())]
+          * 2)
+      self.evaluate(
+          list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_int_tls,
+                                            element_dtype=dtypes.float32))
+
   @test_util.run_in_graph_and_eager_modes()
   def testPushBackBatch(self):
     c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index bdf0774bbf..d9ede87530 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -29,6 +29,10 @@ from tensorflow.python.ops.gen_list_ops import *
 # pylint: enable=wildcard-import
 
 
+ops.NotDifferentiable("TensorListConcatLists")
+ops.NotDifferentiable("TensorListPushBackBatch")
+
+
 @ops.RegisterGradient("TensorListPushBack")
 def _PushBackGrad(op, dresult):
   return gen_list_ops.tensor_list_pop_back(
-- 
GitLab


From cb35f8b702e6bf917b1d915346e959e76d1b1c1e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 18:49:37 -0700
Subject: [PATCH 120/450] Teach transpose folding about sharding.

PiperOrigin-RevId: 193451839
---
 tensorflow/compiler/xla/service/transpose_folding.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index 83185ac49e..3efd38ce0d 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -159,6 +159,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
 
   auto new_conv = HloInstruction::CreateConvolve(
       convolution.shape(), new_lhs, new_rhs, convolution.window(), new_dnums);
+  convolution.SetupDerivedInstruction(new_conv.get());
   TF_CHECK_OK(convolution.parent()->ReplaceWithNewInstruction(
       &convolution, std::move(new_conv)));
 
-- 
GitLab


From 6c85471ee06bf10f5034e2a8fb1fd6ab84dd7fbc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 19:18:33 -0700
Subject: [PATCH 121/450] Update ops-related pbtxt files.

PiperOrigin-RevId: 193454093
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 19 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 19 +++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index d741e2ad46..42a67bc4c8 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -68628,6 +68628,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorListConcatLists"
+  input_arg {
+    name: "input_a"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "input_b"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListElementShape"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index beda05fdf2..980e560601 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -31979,6 +31979,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorListConcatLists"
+  input_arg {
+    name: "input_a"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "input_b"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListElementShape"
   input_arg {
-- 
GitLab


From 542edb6dd64bd18d63ef1fd64c55a645c406f170 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 19:49:12 -0700
Subject: [PATCH 122/450] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 193456151

---
 tensorflow/go/op/wrappers.go | 212 +++++++++++++++++------------------
 1 file changed, 106 insertions(+), 106 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index a5b293ce75..f270eadc32 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -13107,6 +13107,112 @@ func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, padd
 	return op.Output(0)
 }
 
+// Returns a list of tensors with the same shapes and contents as the input
+//
+// tensors.
+//
+// This op can be used to override the gradient for complicated functions. For
+// example, suppose y = f(x) and we wish to apply a custom function g for backprop
+// such that dx = g(dy). In Python,
+//
+// ```python
+// with tf.get_default_graph().gradient_override_map(
+//     {'IdentityN': 'OverrideGradientWithG'}):
+//   y, _ = identity_n([f(x), x])
+//
+// @tf.RegisterGradient('OverrideGradientWithG')
+// def ApplyG(op, dy, _):
+//   return [None, g(dy)]  # Do not backprop to f(x).
+// ```
+func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IdentityN",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("IdentityN", err)
+		return
+	}
+	return output
+}
+
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the centered RMSProp algorithm.
+//
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+//
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// mg <- rho * mg_{t-1} + (1-rho) * grad
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyCenteredRMSProp",
+		Input: []tf.Input{
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Adds `bias` to `value`.
 //
 // This is a deprecated version of BiasAdd and will be soon removed.
@@ -23167,112 +23273,6 @@ func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	return op.Output(0)
 }
 
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
-
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-//
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-//
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
-		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns a list of tensors with the same shapes and contents as the input
-//
-// tensors.
-//
-// This op can be used to override the gradient for complicated functions. For
-// example, suppose y = f(x) and we wish to apply a custom function g for backprop
-// such that dx = g(dy). In Python,
-//
-// ```python
-// with tf.get_default_graph().gradient_override_map(
-//     {'IdentityN': 'OverrideGradientWithG'}):
-//   y, _ = identity_n([f(x), x])
-//
-// @tf.RegisterGradient('OverrideGradientWithG')
-// def ApplyG(op, dy, _):
-//   return [None, g(dy)]  # Do not backprop to f(x).
-// ```
-func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IdentityN",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("IdentityN", err)
-		return
-	}
-	return output
-}
-
 // Computes the gradient of the sigmoid of `x` wrt its input.
 //
 // Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-- 
GitLab


From b7479a808477b61be0269048bf0cfad26070f832 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 20:02:52 -0700
Subject: [PATCH 123/450] Teach the reshape mover pass about sharding.

PiperOrigin-RevId: 193457083
---
 tensorflow/compiler/xla/service/reshape_mover.cc | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index 49ec38eb62..0f26a025bf 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -155,15 +155,20 @@ HloInstruction* UpdateOperand(const HloInstruction* first_reshape_operand,
     case HloOpcode::kConstant: {
       if (first_reshape_operand->opcode() == HloOpcode::kReshape) {
         VLOG(5) << "Adding reshape to kConstant operand";
-        return computation->AddInstruction(
+        HloInstruction* reshape = computation->AddInstruction(
             HloInstruction::CreateReshape(new_shape, operand));
+        operand->SetupDerivedInstruction(reshape);
+        return reshape;
       } else {
         CHECK(first_reshape_operand->opcode() == HloOpcode::kTranspose);
         VLOG(5) << "Adding transpose to kConstant operand";
         std::vector<int64> inverse_permutation =
             InversePermutation(first_reshape_operand->dimensions());
-        return computation->AddInstruction(HloInstruction::CreateTranspose(
-            new_shape, operand, inverse_permutation));
+        HloInstruction* transpose =
+            computation->AddInstruction(HloInstruction::CreateTranspose(
+                new_shape, operand, inverse_permutation));
+        operand->SetupDerivedInstruction(transpose);
+        return transpose;
       }
     }
     case HloOpcode::kRng: {
-- 
GitLab


From 81cabadc78811a216381fbf30715b1313684e32f Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Wed, 18 Apr 2018 20:04:56 -0700
Subject: [PATCH 124/450] Use the host implementation of vec permute op if the
 input is on the host. Note that the op still needs to be placed on the GPU so
 that it stays within the same partition as the neighboring ops, and as a
 result, no unnecessary send and recv are created.

PiperOrigin-RevId: 193457328
---
 .../grappler/optimizers/layout_optimizer.cc     | 17 ++++++++++-------
 .../optimizers/layout_optimizer_test.cc         |  2 +-
 tensorflow/core/kernels/data_format_ops.cc      |  9 ++++++++-
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index db83580c1c..87ab460862 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -909,7 +909,7 @@ class NodeProcessor : public GraphProcessor {
     list->set_i(3, w);
   }
 
-  string MaybeGetHostDevice(const string& input_name) const {
+  bool IsInputOnHost(const string& input_name) const {
     string device = node_->device();
     DeviceNameUtils::ParsedName parsed_name;
     if (DeviceNameUtils::ParseFullName(device, &parsed_name)) {
@@ -918,13 +918,11 @@ class NodeProcessor : public GraphProcessor {
         int port;
         ParseNodeName(input_name, &port);
         if (IsHostMemory(*input, port)) {
-          parsed_name.type = "CPU";
-          parsed_name.id = 0;
-          device = DeviceNameUtils::ParsedNameToString(parsed_name);
+          return true;
         }
       }
     }
-    return device;
+    return false;
   }
 
   NodeDef* AddNodeDataFormatOp(const string& name, const string& input_name,
@@ -934,9 +932,14 @@ class NodeProcessor : public GraphProcessor {
     added_node->set_name(name);
     added_node->set_op(op);
     node_map_->AddNode(added_node->name(), added_node);
+    added_node->set_device(node_->device());
     // The inputs of a DataFormat op could be in host memory for ops such as
-    // Reshape.
-    added_node->set_device(MaybeGetHostDevice(input_name));
+    // Reshape. In such cases, run the kernel on the host too.
+    if (IsInputOnHost(input_name)) {
+      AttrValue attr_kernel;
+      attr_kernel.set_s("host");
+      added_node->mutable_attr()->insert({"_kernel", attr_kernel});
+    }
     AttrValue attr_data_type;
     attr_data_type.set_type(dtype);
     added_node->mutable_attr()->insert({"T", attr_data_type});
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index e405c4c58c..fc87f69b8c 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -1174,7 +1174,7 @@ TEST_F(LayoutOptimizerTest, DevicePlacement) {
   NodeMap node_map(&output);
   auto vec_permute =
       node_map.GetNode("s-0-0-VecPermuteNCHWToNHWC-LayoutOptimizer");
-  EXPECT_TRUE(str_util::EndsWith(vec_permute->device(), "CPU:0"));
+  EXPECT_EQ(vec_permute->attr().at("_kernel").s(), "host");
 }
 }  // namespace
 }  // namespace grappler
diff --git a/tensorflow/core/kernels/data_format_ops.cc b/tensorflow/core/kernels/data_format_ops.cc
index 4485152e96..23319e6d0c 100644
--- a/tensorflow/core/kernels/data_format_ops.cc
+++ b/tensorflow/core/kernels/data_format_ops.cc
@@ -195,7 +195,14 @@ TF_CALL_int64(REGISTER_GPU_KERNEL);
 #define REGISTER_GPU_KERNEL(T)                                                \
   REGISTER_KERNEL_BUILDER(                                                    \
       Name("DataFormatVecPermute").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
-      DataFormatVecPermuteOp<GPUDevice, T>);
+      DataFormatVecPermuteOp<GPUDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("DataFormatVecPermute")                        \
+                              .Device(DEVICE_GPU)                             \
+                              .HostMemory("x")                                \
+                              .HostMemory("y")                                \
+                              .Label("host")                                  \
+                              .TypeConstraint<T>("T"),                        \
+                          DataFormatVecPermuteOp<CPUDevice, T>);
 TF_CALL_int32(REGISTER_GPU_KERNEL);
 TF_CALL_int64(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
-- 
GitLab


From fd10bfb61ef6b1885c8fa2459522fa98305df703 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Wed, 18 Apr 2018 20:15:27 -0700
Subject: [PATCH 125/450] Expose
 tf.contrib.training.{prepend_from_queue_and_padded_batch_dataset}

Also its helper method "enqueue_in_queue_dataset".

PiperOrigin-RevId: 193458095
---
 tensorflow/contrib/training/BUILD       | 1 +
 tensorflow/contrib/training/__init__.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 4d2bfd3e43..5de55b5f7f 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -60,6 +60,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data",
         "//tensorflow/python/estimator:inputs_queues",
         "//third_party/py/numpy",
         "@six_archive//:six",
diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py
index da2de3e421..edd71fb250 100644
--- a/tensorflow/contrib/training/__init__.py
+++ b/tensorflow/contrib/training/__init__.py
@@ -57,6 +57,8 @@ from tensorflow.contrib.training.python.training.hparam import *
 from tensorflow.contrib.training.python.training.resample import *
 from tensorflow.contrib.training.python.training.sampling_ops import *
 from tensorflow.contrib.training.python.training.sequence_queueing_state_saver import *
+from tensorflow.contrib.training.python.training.tensor_queue_dataset import enqueue_in_queue_dataset
+from tensorflow.contrib.training.python.training.tensor_queue_dataset import prepend_from_queue_and_padded_batch_dataset
 from tensorflow.contrib.training.python.training.training import add_gradients_summaries
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms_fn
@@ -75,6 +77,7 @@ _allowed_symbols = [
     'FeedingQueueRunner', 'get_or_create_eval_step', 'StopAfterNEvalsHook',
     'SummaryAtEndHook', 'wait_for_new_checkpoint', 'add_gradients_summaries',
     'clip_gradient_norms', 'clip_gradient_norms_fn', 'create_train_op',
-    'multiply_gradients', 'train']
+    'multiply_gradients', 'enqueue_in_queue_dataset',
+    'prepend_from_queue_and_padded_batch_dataset', 'train']
 
 remove_undocumented(__name__, _allowed_symbols)
-- 
GitLab


From 1d003ee5f82d4d044323a3f162e6cfcf6d645346 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Wed, 18 Apr 2018 21:15:41 -0700
Subject: [PATCH 126/450] Initial addition of CLZ HLO

* Adds the HLO op and lowering on CPU/GPU/evaluator;
* This does not update the operation semantics;

PiperOrigin-RevId: 193461989
---
 .../xla/client/computation_builder.cc         |  5 ++++
 .../compiler/xla/client/computation_builder.h |  3 ++
 .../xla/client/xla_client/xla_builder.cc      |  4 +++
 .../xla/client/xla_client/xla_builder.h       |  3 ++
 .../compiler/xla/service/dfs_hlo_visitor.h    |  3 ++
 .../xla/service/elemental_ir_emitter.cc       |  7 +++++
 .../compiler/xla/service/hlo_evaluator.cc     | 28 +++++++++++++++++++
 .../compiler/xla/service/hlo_graph_dumper.cc  |  1 +
 .../compiler/xla/service/hlo_instruction.cc   |  6 ++++
 tensorflow/compiler/xla/service/hlo_opcode.h  |  1 +
 .../xla/service/instruction_fusion.cc         |  1 +
 .../compiler/xla/service/shape_inference.cc   |  3 ++
 .../compiler/xla/service/user_computation.cc  |  2 ++
 .../xla/tests/array_elementwise_ops_test.cc   |  9 ++++++
 .../compiler/xla/tools/parser/hlo_parser.cc   |  1 +
 tensorflow/compiler/xla/xla_data.proto        |  3 ++
 16 files changed, 80 insertions(+)

diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index 4d3b0ee0d6..83c7cb1744 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -1046,6 +1046,11 @@ ComputationDataHandle ComputationBuilder::Neg(
   return UnaryOp(UNOP_NEGATE, operand);
 }
 
+ComputationDataHandle ComputationBuilder::Clz(
+    const ComputationDataHandle& operand) {
+  return UnaryOp(UNOP_CLZ, operand);
+}
+
 ComputationDataHandle ComputationBuilder::Clamp(
     const ComputationDataHandle& min, const ComputationDataHandle& operand,
     const ComputationDataHandle& max) {
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index 019c6f3afb..9431c2c459 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -657,6 +657,9 @@ class ComputationBuilder {
   // Enqueues a negate instruction onto the computation.
   ComputationDataHandle Neg(const ComputationDataHandle& operand);
 
+  // Enqueues a count-leading-zeros instruction onto the computation.
+  ComputationDataHandle Clz(const ComputationDataHandle& operand);
+
   // Enqueues a transpose instruction onto the computation.
   ComputationDataHandle Transpose(
       const ComputationDataHandle& operand,
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 7ccdc2ded2..1899983e44 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1193,6 +1193,10 @@ XlaOp XlaBuilder::Sign(const XlaOp& operand) {
   return UnaryOp(HloOpcode::kSign, operand);
 }
 
+XlaOp XlaBuilder::Clz(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kClz, operand);
+}
+
 XlaOp XlaBuilder::Cos(const XlaOp& operand) {
   return UnaryOp(HloOpcode::kCos, operand);
 }
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 1f7c731064..5977ee4f4b 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -571,6 +571,9 @@ class XlaBuilder {
   // Enqueues a sign instruction onto the computation.
   XlaOp Sign(const XlaOp& operand);
 
+  // Enqueues a count leading zeros instruction onto the computation.
+  XlaOp Clz(const XlaOp& operand);
+
   // Enqueues a cosine instruction onto the computation.
   XlaOp Cos(const XlaOp& operand);
 
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 56723e7650..0528b07602 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -147,6 +147,9 @@ class DfsHloVisitorBase {
   virtual Status HandleLog(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
+  virtual Status HandleClz(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
+  }
   virtual Status HandleCos(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index b6a0903b0e..56e35e2604 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -293,6 +293,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
         return operand_value;
       }
     }
+    case HloOpcode::kClz: {
+      auto is_zero_undef = ir_builder_->getFalse();
+      return llvm_ir::EmitCallToIntrinsic(
+          llvm::Intrinsic::ctlz, {operand_value, is_zero_undef},
+          {operand_value->getType()}, ir_builder_);
+    }
     case HloOpcode::kSign: {
       bool is_signed =
           primitive_util::IsSignedIntegralType(op->shape().element_type());
@@ -1334,6 +1340,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kAbs:
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kConvert:
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kCopy:
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 52bc2c0448..c5e3014834 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -1853,6 +1853,34 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  // Enable CLZ only for int32 and uint32.
+  template <
+      typename NativeT,
+      typename std::enable_if<
+          (std::is_floating_point<NativeT>::value ||
+           std::is_integral<NativeT>::value || is_complex_t<NativeT>::value) &&
+          !(std::is_same<NativeT, uint32>::value ||
+            std::is_same<NativeT, int32>::value)>::type* = nullptr>
+  Status HandleClz(HloInstruction* clz) {
+    return InvalidArgument("Unsupported type for Clz");
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_same<NativeT, uint32>::value ||
+                std::is_same<NativeT, int32>::value>::type* = nullptr>
+  Status HandleClz(HloInstruction* clz) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz],
+                        ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) {
+                          return 31 - tensorflow::Log2Floor(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  Status HandleClz(HloInstruction* clz) override {
+    return HandleClz<ElementwiseT>(clz);
+  }
+
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleSin(HloInstruction* sin) {
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 25702dc65e..516e14b464 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -909,6 +909,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
+    case HloOpcode::kClz:
     case HloOpcode::kComplex:
     case HloOpcode::kConvert:
     case HloOpcode::kCos:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index a445380817..6303bcc59f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -254,6 +254,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     case HloOpcode::kCeil:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
+    case HloOpcode::kClz:
     case HloOpcode::kExp:
     case HloOpcode::kFloor:
     case HloOpcode::kImag:
@@ -1248,6 +1249,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kBitcast:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
@@ -1728,6 +1730,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kAdd:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
+    case HloOpcode::kClz:
     case HloOpcode::kComplex:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
@@ -2659,6 +2662,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleFloor(this);
     case HloOpcode::kCeil:
       return visitor->HandleCeil(this);
+    case HloOpcode::kClz:
+      return visitor->HandleClz(this);
     case HloOpcode::kLog:
       return visitor->HandleLog(this);
     case HloOpcode::kTanh:
@@ -3000,6 +3005,7 @@ bool HloInstruction::IsElementwise() const {
     case HloOpcode::kAbs:
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kConvert:
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kCopy:
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index af24604c39..ca763076a1 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -57,6 +57,7 @@ namespace xla {
   V(kCall, "call", kHloOpcodeIsVariadic)                     \
   V(kCeil, "ceil")                                           \
   V(kClamp, "clamp")                                         \
+  V(kClz, "count-leading-zeros")                             \
   V(kComplex, "complex")                                     \
   V(kConcatenate, "concatenate", kHloOpcodeIsVariadic)       \
   V(kConditional, "conditional")                             \
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index d69ad80bdb..b9ccfeddb5 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -39,6 +39,7 @@ namespace xla {
     case HloOpcode::kBroadcast:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
+    case HloOpcode::kClz:
     case HloOpcode::kComplex:
     case HloOpcode::kConcatenate:
     case HloOpcode::kConstant:
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 77e12d3602..48b2922e77 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -52,6 +52,8 @@ UnaryOperation OpcodeToUnaryOperation(HloOpcode opcode) {
       return UNOP_ABS;
     case HloOpcode::kCeil:
       return UNOP_CEIL;
+    case HloOpcode::kClz:
+      return UNOP_CLZ;
     case HloOpcode::kCos:
       return UNOP_COS;
     case HloOpcode::kExp:
@@ -360,6 +362,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
             arg, primitive_util::ComplexComponentType(arg.element_type()));
       }
       return arg;
+    case UNOP_CLZ:
     case UNOP_NEGATE:
     case UNOP_ROUND_NEAREST_AFZ:
     case UNOP_SIGN:
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 532f7fd5bf..0f16a592b6 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -49,6 +49,8 @@ HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) {
       return HloOpcode::kAbs;
     case UNOP_CEIL:
       return HloOpcode::kCeil;
+    case UNOP_CLZ:
+      return HloOpcode::kClz;
     case UNOP_COS:
       return HloOpcode::kCos;
     case UNOP_EXP:
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 03c91745b9..4b4dc6dd9d 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -2217,6 +2217,15 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogF32sVector) {
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClzU32s) {
+  XlaBuilder builder(TestName());
+  auto a = builder.ConstantR1<uint32>(
+      {0, 1, 0x10, 0x10000, 0x700000, 0x12345678, 0xF2345678});
+  builder.Clz(a);
+
+  ComputeAndCompareR1<uint32>(&builder, {32, 31, 27, 15, 9, 3, 0}, {});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) {
   // a ------ (add) --------- (add)
   //         /               /
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index e60a5a4919..95d3fd28b3 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -470,6 +470,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kBitcast:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index f18d53c608..d23f9e5918 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -801,6 +801,9 @@ enum UnaryOperation {
 
   // Elementwise, extract real component of complex x.
   UNOP_IMAG = 16;
+
+  // Elementwise, computes clz(x).
+  UNOP_CLZ = 17;
 }
 
 message UnaryOpRequest {
-- 
GitLab


From ee1676d4dbded64e192aecfa693ab605e24c9929 Mon Sep 17 00:00:00 2001
From: Yuanzhong Xu <yuanzx@google.com>
Date: Wed, 18 Apr 2018 22:07:12 -0700
Subject: [PATCH 127/450] [XLA] Fix BF16 propagation bug for while condition.

PiperOrigin-RevId: 193465140
---
 .../xla/service/bfloat16_propagation.cc       |  1 -
 .../xla/service/bfloat16_propagation_test.cc  | 58 ++++++++++++++++++-
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
index c26d2feef5..43ebe92c5e 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -392,7 +392,6 @@ void BFloat16Propagation::AdjustCalledComputationRoot(HloInstruction* hlo) {
       adjust_computation(hlo->fused_instructions_computation(), hlo->shape());
       break;
     case HloOpcode::kWhile:
-      adjust_computation(hlo->while_condition(), hlo->shape());
       adjust_computation(hlo->while_body(), hlo->shape());
       break;
     default:
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index 88f8301416..183db1652e 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -426,8 +426,62 @@ TEST_F(BFloat16PropagationTest, SelectOverTuples) {
   EXPECT_TRUE(OutputsBF16(xpose));
 }
 
-// Tests that BF16 is propagated properly through while computations.
-TEST_F(BFloat16PropagationTest, PropagateThroughWhile) {
+// Tests that BF16 is propagated properly through a while computation with
+// non-tuple input/output.
+TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
+
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, shape, "param1"));
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+
+  auto builder_cond = HloComputation::Builder("cond");
+  auto cond_param = builder_cond.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "cond_param"));
+  auto cond_dot = builder_cond.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, cond_param, cond_param));
+  auto cond_root = builder_cond.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+      builder_cond.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond_dot, {0, 0}, {1, 1}, {1, 1})),
+      builder_cond.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond_dot, {1, 1}, {2, 2}, {1, 1}))));
+  auto cond = module->AddEmbeddedComputation(builder_cond.Build());
+
+  auto builder_body = HloComputation::Builder("body");
+  auto body_param = builder_body.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "body_param"));
+  auto body_dot = builder_body.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, body_param, body_param));
+  auto body = module->AddEmbeddedComputation(builder_body.Build());
+
+  auto while_hlo = builder.AddInstruction(
+      HloInstruction::CreateWhile(shape, cond, body, add));
+
+  auto dot = builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, while_hlo, while_hlo));
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), dot);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(cond_root->shape(), ShapeUtil::MakeShape(PRED, {})));
+  EXPECT_TRUE(OutputsBF16(add));
+  EXPECT_TRUE(OutputsBF16(body_dot));
+  EXPECT_TRUE(OutputsBF16(body_param));
+  EXPECT_TRUE(OutputsBF16(cond_param));
+  EXPECT_FALSE(OutputsBF16(dot));
+}
+
+// Tests that BF16 is propagated properly through while computations with
+// tuple-shaped input/output.
+TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
   auto module = CreateNewModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
-- 
GitLab


From 2a6c5998a239f41926ca295ac20bb595862fd5ff Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 18 Apr 2018 22:59:01 -0700
Subject: [PATCH 128/450] [tf.data] Add native implementation for
 `tf.contrib.data.unbatch()`.

The implementation has two main improvements:
1. Avoid relatively expensive (~15us) function invocation for each incoming batch.
2. Use std::move() where possible to avoid copying strings/variants into the unbatched
   elements.

PiperOrigin-RevId: 193467856
---
 .../kernel_tests/batch_dataset_op_test.py     | 228 +++++++++++++++++-
 .../contrib/data/python/ops/batching.py       | 135 ++++++++---
 tensorflow/contrib/tpu/python/tpu/datasets.py |   2 +-
 .../base_api/api_def_UnbatchDataset.pbtxt     |   4 +
 .../python_api/api_def_UnbatchDataset.pbtxt   |   4 +
 tensorflow/core/framework/tensor.h            |   5 +
 tensorflow/core/kernels/batch_util.cc         |  73 +++++-
 tensorflow/core/kernels/batch_util.h          |   6 +
 tensorflow/core/kernels/data/BUILD            |  14 ++
 .../core/kernels/data/unbatch_dataset_op.cc   | 204 ++++++++++++++++
 tensorflow/core/ops/dataset_ops.cc            |   7 +
 11 files changed, 635 insertions(+), 47 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
 create mode 100644 tensorflow/core/kernels/data/unbatch_dataset_op.cc

diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 413d873797..e1ec60d7c9 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -18,15 +18,18 @@ from __future__ import division
 from __future__ import print_function
 
 import math
+import time
 
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import batching
+from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -34,6 +37,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat
 
 
 class BatchDatasetTest(test.TestCase):
@@ -151,6 +155,69 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
 
+  def testUnbatchDatasetWithStrings(self):
+    data = tuple([math_ops.range(10) for _ in range(3)])
+    data = dataset_ops.Dataset.from_tensor_slices(data)
+    data = data.map(lambda x, y, z: (x, string_ops.as_string(y), z))
+    expected_types = (dtypes.int32, dtypes.string, dtypes.int32)
+    data = data.batch(2)
+    self.assertEqual(expected_types, data.output_types)
+    data = data.apply(batching.unbatch())
+    self.assertEqual(expected_types, data.output_types)
+
+    iterator = data.make_one_shot_iterator()
+    op = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        self.assertEqual((i, compat.as_bytes(str(i)), i), sess.run(op))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(op)
+
+  def testUnbatchDatasetWithSparseTensor(self):
+    st = sparse_tensor.SparseTensorValue(
+        indices=[[i, i] for i in range(10)],
+        values=list(range(10)),
+        dense_shape=[10, 10])
+    data = dataset_ops.Dataset.from_tensors(st)
+    data = data.apply(batching.unbatch())
+    data = data.batch(5)
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        st_row = sess.run(next_element)
+        self.assertEqual([i], st_row.indices)
+        self.assertEqual([i], st_row.values)
+        self.assertEqual([10], st_row.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testUnbatchDatasetWithDenseAndSparseTensor(self):
+    st = sparse_tensor.SparseTensorValue(
+        indices=[[i, i] for i in range(10)],
+        values=list(range(10)),
+        dense_shape=[10, 10])
+    data = dataset_ops.Dataset.from_tensors((list(range(10)), st))
+    data = data.apply(batching.unbatch())
+    data = data.batch(5)
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        dense_elem, st_row = sess.run(next_element)
+        self.assertEqual(i, dense_elem)
+        self.assertEqual([i], st_row.indices)
+        self.assertEqual([i], st_row.values)
+        self.assertEqual([10], st_row.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
   def testUnbatchSingleElementTupleDataset(self):
     data = tuple([(math_ops.range(10),) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -191,6 +258,53 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
 
+  def testUnbatchEmpty(self):
+    data = dataset_ops.Dataset.from_tensors(
+        (constant_op.constant([]), constant_op.constant([], shape=[0, 4]),
+         constant_op.constant([], shape=[0, 4, 0])))
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testUnbatchStaticShapeMismatch(self):
+    data = dataset_ops.Dataset.from_tensors((np.arange(7), np.arange(8),
+                                             np.arange(9)))
+    with self.assertRaises(ValueError):
+      data.apply(batching.unbatch())
+
+  def testUnbatchDynamicShapeMismatch(self):
+    ph1 = array_ops.placeholder(dtypes.int32, shape=[None])
+    ph2 = array_ops.placeholder(dtypes.int32, shape=None)
+    data = dataset_ops.Dataset.from_tensors((ph1, ph2))
+    data = data.apply(batching.unbatch())
+    iterator = data.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Mismatch in the 0th dimension.
+      sess.run(
+          iterator.initializer,
+          feed_dict={
+              ph1: np.arange(7).astype(np.int32),
+              ph2: np.arange(8).astype(np.int32)
+          })
+      with self.assertRaises(errors.InvalidArgumentError):
+        print(sess.run(next_element))
+
+      # No 0th dimension (i.e. scalar value) for one component.
+      sess.run(
+          iterator.initializer,
+          feed_dict={
+              ph1: np.arange(7).astype(np.int32),
+              ph2: 7
+          })
+      with self.assertRaises(errors.InvalidArgumentError):
+        print(sess.run(next_element))
+
   def testBatchAndDropRemainder(self):
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
@@ -545,6 +659,28 @@ class BatchDatasetSerializationTest(
     self.run_core_tests(self._build_dataset_nested_sparse, None, 1)
 
 
+class UnbatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def build_dataset(self, multiplier=15.0, tensor_slice_len=2, batch_size=2):
+    components = (
+        np.arange(tensor_slice_len),
+        np.array([[1, 2, 3]]) * np.arange(tensor_slice_len)[:, np.newaxis],
+        np.array(multiplier) * np.arange(tensor_slice_len))
+
+    return dataset_ops.Dataset.from_tensor_slices(components).batch(
+        batch_size).apply(batching.unbatch())
+
+  def testCore(self):
+    tensor_slice_len = 8
+    batch_size = 2
+    num_outputs = tensor_slice_len
+    self.run_core_tests(
+        lambda: self.build_dataset(15.0, tensor_slice_len, batch_size),
+        lambda: self.build_dataset(20.0, tensor_slice_len, batch_size),
+        num_outputs)
+
+
 class PaddedBatchDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
@@ -586,10 +722,12 @@ class RestructuredDatasetTest(test.TestCase):
   def test_assert_element_shape(self):
 
     def create_unknown_shape_dataset(x):
-      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
-                                           np.zeros((3, 4), dtype=np.int32)),
-                                [x],
-                                [dtypes.float32, dtypes.int32])
+      return script_ops.py_func(
+          lambda _: (  # pylint: disable=g-long-lambda
+              np.ones(2, dtype=np.float32),
+              np.zeros((3, 4), dtype=np.int32)),
+          [x],
+          [dtypes.float32, dtypes.int32])
 
     dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
     unknown_shapes = (tensor_shape.TensorShape(None),
@@ -626,10 +764,12 @@ class RestructuredDatasetTest(test.TestCase):
   def test_assert_wrong_element_shape_on_unknown_shape_dataset(self):
 
     def create_unknown_shape_dataset(x):
-      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
-                                           np.zeros((3, 4), dtype=np.int32)),
-                                [x],
-                                [dtypes.float32, dtypes.int32])
+      return script_ops.py_func(
+          lambda _: (  # pylint: disable=g-long-lambda
+              np.ones(2, dtype=np.float32),
+              np.zeros((3, 4), dtype=np.int32)),
+          [x],
+          [dtypes.float32, dtypes.int32])
 
     dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
     unknown_shapes = (tensor_shape.TensorShape(None),
@@ -649,5 +789,77 @@ class RestructuredDatasetTest(test.TestCase):
         sess.run(get_next)
 
 
+class UnbatchDatasetBenchmark(test.Benchmark):
+
+  def benchmarkNativeUnbatch(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.apply(batching.unbatch())
+      dataset = dataset.skip(elems_per_trial)  # each run skips a whole trial
+      iterator = dataset.make_initializable_iterator()
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):  # 5 trials; the median is reported
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (native) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=elems_per_trial,  # keep in sync with the divisor above
+              wall_time=median_wall_time,
+              name="benchmark_unbatch_dataset_native_batch_size_%d" %
+              batch_size)
+
+  # Include a benchmark of the previous `unbatch()` implementation that uses
+  # a composition of more primitive ops. Eventually we'd hope to generate code
+  # that is as good in both cases.
+  def benchmarkOldUnbatchImplementation(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
+      dataset = dataset.skip(elems_per_trial)  # each run skips a whole trial
+      iterator = dataset.make_initializable_iterator()
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):  # 5 trials; the median is reported
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (unfused) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=elems_per_trial,  # keep in sync with the divisor above
+              wall_time=median_wall_time,
+              name="benchmark_unbatch_dataset_unfused_batch_size_%d" %
+              batch_size)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index 28db949da9..2152bcde84 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -80,28 +80,98 @@ def dense_to_sparse_batch(batch_size, row_shape):
   return _apply_fn
 
 
+class UnbatchDataset(dataset_ops.Dataset):
+  """A dataset that splits the elements of its input into multiple elements."""
+
+  def __init__(self, input_dataset):
+    """See `unbatch()` for more details."""
+    super(UnbatchDataset, self).__init__()
+    flat_shapes = nest.flatten(input_dataset.output_shapes)
+    if any(s.ndims == 0 for s in flat_shapes):
+      raise ValueError("Cannot unbatch an input with scalar components.")
+    known_batch_dim = tensor_shape.Dimension(None)
+    for s in flat_shapes:
+      try:
+        known_batch_dim = known_batch_dim.merge_with(s[0])
+      except ValueError:
+        raise ValueError("Cannot unbatch an input whose components have "
+                         "different batch sizes.")
+    self._input_dataset = input_dataset
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.unbatch_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return nest.map_structure(lambda s: s[1:],
+                              self._input_dataset.output_shapes)
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
 def unbatch():
-  """A Transformation which splits the elements of a dataset.
+  """Splits elements of a dataset into multiple elements on the batch dimension.
 
   For example, if elements of the dataset are shaped `[B, a0, a1, ...]`,
-  where `B` may vary from element to element, then for each element in
-  the dataset, the unbatched dataset will contain `B` consecutive elements
+  where `B` may vary for each input element, then for each element in the
+  dataset, the unbatched dataset will contain `B` consecutive elements
   of shape `[a0, a1, ...]`.
 
+  ```python
+  # NOTE: The following example uses `{ ... }` to represent the contents
+  # of a dataset.
+  a = { ['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd'] }
+
+  a.apply(tf.contrib.data.unbatch()) == {
+      'a', 'b', 'c', 'a', 'b', 'a', 'b', 'c', 'd'}
+  ```
+
   Returns:
     A `Dataset` transformation function, which can be passed to
     @{tf.data.Dataset.apply}.
   """
 
   def _apply_fn(dataset):
-
-    def unbatch_map(arg, *rest):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    if not sparse.any_sparse(dataset.output_classes):
+      return UnbatchDataset(dataset)
+
+    # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
+    # are normalized to the rank-1 dense representation, so that the
+    # sparse-oblivious unbatching logic will slice them
+    # appropriately. This leads to a somewhat inefficient re-encoding step
+    # for all SparseTensor components.
+    # TODO(mrry): Consider optimizing this in future
+    # if it turns out to be a bottleneck.
+    def normalize(arg, *rest):
       if rest:
-        return dataset_ops.Dataset.from_tensor_slices((arg,) + rest)
+        return sparse.serialize_many_sparse_tensors((arg,) + rest)
       else:
-        return dataset_ops.Dataset.from_tensor_slices(arg)
+        return sparse.serialize_many_sparse_tensors(arg)
+
+    normalized_dataset = dataset.map(normalize)
 
-    return dataset.flat_map(map_func=unbatch_map)
+    # NOTE(mrry): Our `map()` has lost information about the sparseness
+    # of any SparseTensor components, so re-apply the structure of the
+    # original dataset.
+    restructured_dataset = _RestructuredDataset(
+        normalized_dataset,
+        dataset.output_types,
+        dataset.output_shapes,
+        dataset.output_classes,
+        allow_unsafe_cast=True)
+    return UnbatchDataset(restructured_dataset)
 
   return _apply_fn
 
@@ -265,7 +335,8 @@ class _RestructuredDataset(dataset_ops.Dataset):
                dataset,
                output_types,
                output_shapes=None,
-               output_classes=None):
+               output_classes=None,
+               allow_unsafe_cast=False):
     """Creates a new dataset with the given output types and shapes.
 
     The given `dataset` must have a structure that is convertible:
@@ -283,6 +354,10 @@ class _RestructuredDataset(dataset_ops.Dataset):
         If omitted, the shapes will be inherited from `dataset`.
       output_classes: (Optional.) A nested structure of class types.
         If omitted, the class types will be inherited from `dataset`.
+      allow_unsafe_cast: (Optional.) If `True`, the caller may switch the
+        reported output types and shapes of the restructured dataset, e.g. to
+        switch a sparse tensor represented as `tf.variant` to its user-visible
+        type and shape.
 
     Raises:
       ValueError: If either `output_types` or `output_shapes` is not compatible
@@ -291,14 +366,15 @@ class _RestructuredDataset(dataset_ops.Dataset):
     super(_RestructuredDataset, self).__init__()
     self._dataset = dataset
 
-    # Validate that the types are compatible.
-    output_types = nest.map_structure(dtypes.as_dtype, output_types)
-    flat_original_types = nest.flatten(dataset.output_types)
-    flat_new_types = nest.flatten(output_types)
-    if flat_original_types != flat_new_types:
-      raise ValueError(
-          "Dataset with output types %r cannot be restructured to have output "
-          "types %r" % (dataset.output_types, output_types))
+    if not allow_unsafe_cast:
+      # Validate that the types are compatible.
+      output_types = nest.map_structure(dtypes.as_dtype, output_types)
+      flat_original_types = nest.flatten(dataset.output_types)
+      flat_new_types = nest.flatten(output_types)
+      if flat_original_types != flat_new_types:
+        raise ValueError(
+            "Dataset with output types %r cannot be restructured to have "
+            "output types %r" % (dataset.output_types, output_types))
 
     self._output_types = output_types
 
@@ -308,18 +384,19 @@ class _RestructuredDataset(dataset_ops.Dataset):
                                                   nest.flatten(
                                                       dataset.output_shapes))
     else:
-      # Validate that the shapes are compatible.
-      nest.assert_same_structure(output_types, output_shapes)
-      flat_original_shapes = nest.flatten(dataset.output_shapes)
-      flat_new_shapes = nest.flatten_up_to(output_types, output_shapes)
-
-      for original_shape, new_shape in zip(flat_original_shapes,
-                                           flat_new_shapes):
-        if not original_shape.is_compatible_with(new_shape):
-          raise ValueError(
-              "Dataset with output shapes %r cannot be restructured to have "
-              "incompatible output shapes %r" % (dataset.output_shapes,
-                                                 output_shapes))
+      if not allow_unsafe_cast:
+        # Validate that the shapes are compatible.
+        nest.assert_same_structure(output_types, output_shapes)
+        flat_original_shapes = nest.flatten(dataset.output_shapes)
+        flat_new_shapes = nest.flatten_up_to(output_types, output_shapes)
+
+        for original_shape, new_shape in zip(flat_original_shapes,
+                                             flat_new_shapes):
+          if not original_shape.is_compatible_with(new_shape):
+            raise ValueError(
+                "Dataset with output shapes %r cannot be restructured to have "
+                "incompatible output shapes %r" % (dataset.output_shapes,
+                                                   output_shapes))
       self._output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
     if output_classes is None:
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index 465c668fd8..2e472a2805 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -170,7 +170,7 @@ def StreamingFilesDataset(files,
         args=[source_handle],
         Tout=[dtypes.string],
         f=LoadingFunc,
-        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
+        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0]
 
   with ops.device('/job:%s' % worker_job):
     output_dataset = dataset_ops.Dataset.range(2).repeat().map(
diff --git a/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
new file mode 100644
index 0000000000..324fadac0a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UnbatchDataset"
+  summary: "A dataset that splits the elements of its input into multiple elements."
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
new file mode 100644
index 0000000000..1e5415749f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UnbatchDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 4d10f7efb5..58fbced606 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -44,6 +44,7 @@ class TensorProto;
 class VariantTensorData;
 namespace batch_util {
 Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
+Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index);
 }  // namespace batch_util
 
 /// @ingroup core
@@ -493,6 +494,10 @@ class Tensor {
   friend Status batch_util::CopyElementToSlice(
       Tensor element, Tensor* parent,
       int64 index);                // For access to RefCountIsOne().
+  friend Status batch_util::MaybeMoveSliceToElement(
+      Tensor* parent, Tensor* element,
+      int64 index);  // For access to RefCountIsOne().
+
   friend class NumpyTensorBuffer;  // For access to the private constructor
                                    // taking the buffer.
 
diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/kernels/batch_util.cc
index 1a45212ad2..52be1ab8d0 100644
--- a/tensorflow/core/kernels/batch_util.cc
+++ b/tensorflow/core/kernels/batch_util.cc
@@ -78,14 +78,44 @@ Status HandleElementToSlice<Variant>(Tensor element, Tensor* parent,
   return Status::OK();
 }
 
-// TODO(jsimsa): Add HandleElementToSlice<variant> specialization that moves
-// the data when possible.
-
+// TODO(b/78245576): Consider removing this overload.
 template <typename T>
-static Status HandleSliceToElement(const Tensor& parent, Tensor* element,
-                                   int64 index) {
+void HandleSliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   element->flat<T>() = parent.flat_outer_dims<T>().chip(index, 0);
-  return Status::OK();
+}
+
+template <typename T>
+void HandleSliceToElement(Tensor* parent, Tensor* element, int64 index,
+                          bool can_move) {
+  element->flat<T>() = parent->flat_outer_dims<T>().chip(index, 0);
+}
+
+template <>
+void HandleSliceToElement<string>(Tensor* parent, Tensor* element, int64 index,
+                                  bool can_move) {
+  auto parent_as_matrix = parent->flat_outer_dims<string>();
+  auto element_flat = element->flat<string>();
+  if (can_move) {
+    for (int64 i = 0; i < element->NumElements(); ++i) {
+      element_flat(i) = std::move(parent_as_matrix(index, i));
+    }
+  } else {
+    element_flat = parent_as_matrix.chip(index, 0);
+  }
+}
+
+template <>
+void HandleSliceToElement<Variant>(Tensor* parent, Tensor* element, int64 index,
+                                   bool can_move) {
+  auto parent_as_matrix = parent->flat_outer_dims<Variant>();
+  auto element_flat = element->flat<Variant>();
+  if (can_move) {
+    for (int64 i = 0; i < element->NumElements(); ++i) {
+      element_flat(i) = std::move(parent_as_matrix(index, i));
+    }
+  } else {
+    element_flat = parent_as_matrix.chip(index, 0);
+  }
 }
 
 }  // namespace
@@ -115,9 +145,10 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
 Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   TF_RETURN_IF_ERROR(ValidateInput(parent, *element, index));
 
-#define HANDLE_TYPE(T)                                      \
-  case DataTypeToEnum<T>::value: {                          \
-    return HandleSliceToElement<T>(parent, element, index); \
+#define HANDLE_TYPE(T)                               \
+  case DataTypeToEnum<T>::value: {                   \
+    HandleSliceToElement<T>(parent, element, index); \
+    return Status::OK();                             \
   }
 
   switch (parent.dtype()) {
@@ -130,6 +161,30 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   }
 }
 
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+// Returns Unimplemented if the dtype of `parent` is not handled below.
+// NOTE(mrry): The implementation may be able to optimize the copy to a move.
+// This is particularly important for DT_STRING tensors.
+Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index) {
+  TF_RETURN_IF_ERROR(ValidateInput(*parent, *element, index));
+  bool can_move = parent->RefCountIsOne();  // move only if refcount == 1
+
+#define HANDLE_TYPE(T)                                         \
+  case DataTypeToEnum<T>::value: {                             \
+    HandleSliceToElement<T>(parent, element, index, can_move); \
+    return Status::OK();                                       \
+  }
+
+  switch (parent->dtype()) {
+    TF_CALL_ALL_TYPES(HANDLE_TYPE);
+    TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented(
+          "MaybeMoveSliceToElement Unhandled data type: ", parent->dtype());
+  }
+}
+
 // The following five functions are copied from padding_fifo_queue.cc.
 // TODO(mrry): Reconcile these functions with the similar methods in the
 // queue implementation.
diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h
index a47bf1935d..69098fbd1d 100644
--- a/tensorflow/core/kernels/batch_util.h
+++ b/tensorflow/core/kernels/batch_util.h
@@ -32,6 +32,12 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
 // Copies the index^th slice of parent (in the 0th dimension) into element.
 Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index);
 
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+//
+// NOTE(mrry): The implementation may be able to optimize the copy to a move.
+// This is particularly important for DT_STRING tensors.
+Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index);
+
 // Zero-initializes the tensor `element` using the scalar stored in `padding`.
 // Both `element` and `padding` must have matching `dtype`.
 Status SetElementZero(Tensor* element, const Tensor& padding);
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 221724e25d..1e96eb6421 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -446,6 +446,19 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "unbatch_dataset_op",
+    srcs = ["unbatch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
 tf_kernel_library(
     name = "zip_dataset_op",
     srcs = ["zip_dataset_op.cc"],
@@ -562,6 +575,7 @@ tf_kernel_library(
         ":tensor_dataset_op",
         ":tensor_queue_dataset_op",
         ":tensor_slice_dataset_op",
+        ":unbatch_dataset_op",
         ":zip_dataset_op",
     ],
 )
diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
new file mode 100644
index 0000000000..241b615aca
--- /dev/null
+++ b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
@@ -0,0 +1,204 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class UnbatchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit UnbatchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    *output = new Dataset(ctx, input);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    explicit Dataset(OpKernelContext* ctx, DatasetBase* input)
+        : GraphDatasetBase(ctx), input_(input) {
+      input_->Ref();
+      for (const PartialTensorShape& shape : input->output_shapes()) {
+        gtl::InlinedVector<int64, 4> partial_dim_sizes;
+        for (int i = 1; i < shape.dims(); ++i) {
+          partial_dim_sizes.push_back(shape.dim_size(i));
+        }
+        shapes_.emplace_back(std::move(partial_dim_sizes));
+      }
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Unbatch")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return shapes_;
+    }
+
+    string DebugString() override { return "UnbatchDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            current_index_(0),
+            current_batch_size_(0),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
+            shapes_(params.dataset->output_shapes().size()) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (!input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+        *end_of_sequence = false;
+        while (!*end_of_sequence) {
+          if (current_index_ < current_batch_size_) {
+            out_tensors->clear();
+            out_tensors->reserve(tensors_.size());
+            for (int i = 0; i < tensors_.size(); ++i) {
+              out_tensors->emplace_back(ctx->allocator({}), tensors_[i].dtype(),
+                                        shapes_[i]);
+              TF_RETURN_IF_ERROR(batch_util::MaybeMoveSliceToElement(
+                  &tensors_[i], &out_tensors->back(), current_index_));
+            }
+            ++current_index_;
+            *end_of_sequence = false;
+            return Status::OK();
+          }
+          current_index_ = 0;
+          current_batch_size_ = 0;
+          tensors_.clear();
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, &tensors_, end_of_sequence));
+          if (!*end_of_sequence) {
+            for (size_t i = 0; i < tensors_.size(); ++i) {
+              if (tensors_[i].dims() == 0) {
+                return errors::InvalidArgument(
+                    "Input element must have a non-scalar value in each "
+                    "component.");
+              }
+              if (tensors_[i].dim_size(0) != tensors_[0].dim_size(0)) {
+                return errors::InvalidArgument(
+                    "Input element must have the same batch size in each "
+                    "component. Component 0 had size ",
+                    tensors_[0].dim_size(0), " but component ", i,
+                    " had size, ", tensors_[i].dim_size(0), ".");
+              }
+              shapes_[i] = tensors_[i].shape();
+              shapes_[i].RemoveDim(0);
+            }
+            current_batch_size_ = tensors_[0].dim_size(0);
+          }
+        }
+        input_impl_.reset();
+        return Status::OK();
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("current_index"), current_index_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("n"), current_batch_size_));
+        if (current_index_ < current_batch_size_) {
+          for (size_t i = 0; i < tensors_.size(); ++i) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat("tensors[", i, "]")), tensors_[i]));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("current_index"), &current_index_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("n"), &current_batch_size_));
+        tensors_.clear();
+        tensors_.resize(dataset()->output_dtypes().size());
+        if (current_index_ < current_batch_size_) {
+          for (size_t i = 0; i < tensors_.size(); ++i) {
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                full_name(strings::StrCat("tensors[", i, "]")), &tensors_[i]));
+            shapes_[i] = tensors_[i].shape();
+            shapes_[i].RemoveDim(0);
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 current_index_ GUARDED_BY(mu_);
+      int64 current_batch_size_ GUARDED_BY(mu_);
+      std::vector<Tensor> tensors_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::vector<TensorShape> shapes_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    std::vector<PartialTensorShape> shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnbatchDataset").Device(DEVICE_CPU),
+                        UnbatchDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 57f871af32..8be569b315 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -83,6 +83,13 @@ REGISTER_OP("GeneratorDataset")
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("UnbatchDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ZipDataset")
     .Input("input_datasets: N * variant")
     .Output("handle: variant")
-- 
GitLab


From 2294834612cde9781e37021af7ba8480aadbb112 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 23:18:50 -0700
Subject: [PATCH 129/450] Update ops-related pbtxt files.

PiperOrigin-RevId: 193469437
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 23 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 23 +++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 42a67bc4c8..9bc11cf0fe 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -70008,6 +70008,29 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "UnbatchGrad"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 980e560601..9b665190ce 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -32826,6 +32826,29 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "UnbatchGrad"
   input_arg {
-- 
GitLab


From 2024f37f78e04ed1d035f53d2c3804bfb12e690f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 23:46:34 -0700
Subject: [PATCH 130/450] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 193471104

---
 tensorflow/go/op/wrappers.go | 570 +++++++++++++++++------------------
 1 file changed, 285 insertions(+), 285 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index f270eadc32..35ad1eff0f 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -22806,49 +22806,6 @@ func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Out
 	return op.Output(0), op.Output(1)
 }
 
-//     Adds v into specified rows of x.
-//
-//     Computes y = x; y[i, :] += v; return y.
-//
-// Arguments:
-//	x: A `Tensor` of type T.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
-//
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InplaceAdd",
-		Input: []tf.Input{
-			x, i, v,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Restore a Reader to its initial clean state.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
 type ResourceApplyAdamAttr func(optionalAttr)
 
@@ -24195,21 +24152,299 @@ func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output,
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
+// Returns x / y element-wise for real types.
+//
+// If `x` and `y` are reals, this will return the floating-point division.
+//
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RealDiv",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+//     Adds v into specified rows of x.
+//
+//     Computes y = x; y[i, :] += v; return y.
+//
+// Arguments:
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InplaceAdd",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Restore a Reader to its initial clean state.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderResetV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// RpcAttr is an optional argument to Rpc.
+type RpcAttr func(optionalAttr)
+
+// RpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func RpcProtocol(value string) RpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// RpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func RpcFailFast(value bool) RpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func RpcTimeoutInMs(value int64) RpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the RPC method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// If the connection fails or the remote worker returns an error
+// status, the op reraises this exception locally.
+//
+// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
+func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Rpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
+
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a ordered
+//
+// associative container.   Elements are ordered by key.
+//
+// Arguments:
+//	key: int64
+//
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapStage",
+		Input: []tf.Input{
+			key, indices, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
+
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+//
+// value: Swap `elem` to CPU. Default to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+	return func(m optionalAttr) {
+		m["swap_memory"] = value
+	}
+}
+
+// Push an element onto the stack.
 //
-// If `x` and `y` are reals, this will return the floating-point division.
+// Arguments:
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RealDiv",
+		Type: "StackPushV2",
 		Input: []tf.Input{
-			x, y,
+			handle, elem,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -26269,241 +26504,6 @@ func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional .
 	return op.Output(0)
 }
 
-// RpcAttr is an optional argument to Rpc.
-type RpcAttr func(optionalAttr)
-
-// RpcProtocol sets the optional protocol attribute to value.
-//
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func RpcProtocol(value string) RpcAttr {
-	return func(m optionalAttr) {
-		m["protocol"] = value
-	}
-}
-
-// RpcFailFast sets the optional fail_fast attribute to value.
-//
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func RpcFailFast(value bool) RpcAttr {
-	return func(m optionalAttr) {
-		m["fail_fast"] = value
-	}
-}
-
-// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
-//
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
-// If not specified, defaults to 0
-func RpcTimeoutInMs(value int64) RpcAttr {
-	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
-	}
-}
-
-// Perform batches of RPC requests.
-//
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
-//
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the RPC method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
-//
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
-//
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
-//
-// then call this op with arguments:
-//
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
-//
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
-//
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
-//
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
-//
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
-//
-// If the connection fails or the remote worker returns an error
-// status, the op reraises this exception locally.
-//
-// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
-//
-// Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
-//
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
-func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Rpc",
-		Input: []tf.Input{
-			address, method, request,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
-
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapStageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Stage (key, values) in the underlying container which behaves like a ordered
-//
-// associative container.   Elements are ordered by key.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-//
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
-
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
-//
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
-	return func(m optionalAttr) {
-		m["swap_memory"] = value
-	}
-}
-
-// Push an element onto the stack.
-//
-// Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
-//
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StackPushV2",
-		Input: []tf.Input{
-			handle, elem,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
 type FusedBatchNormGradV2Attr func(optionalAttr)
 
-- 
GitLab


From a4b0b02ef66586ac98d558099a37662a892f14f1 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 19 Apr 2018 00:28:33 -0700
Subject: [PATCH 131/450] docs: Add a note on building the C and/or Java API
 binaries from source.

See #15290

PiperOrigin-RevId: 193473886
---
 .../docs_src/install/install_sources.md       | 33 +++++++++++++++----
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 7d7c2aa75a..26287aa3a1 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -241,12 +241,12 @@ One of the questions that `configure` will ask is as follows:
 Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]
 </pre>
 
-This question refers to a later phase in which you'll use bazel to
-[build the pip package](#build-the-pip-package).  We recommend
-accepting the default (`-march=native`), which will
-optimize the generated code for your local machine's CPU type.  However,
-if you are building TensorFlow on one CPU type but will run TensorFlow on
-a different CPU type, then consider specifying a more specific optimization
+This question refers to a later phase in which you'll use bazel to [build the
+pip package](#build-the-pip-package) or the [C/Java libraries](#BuildCorJava).
+We recommend accepting the default (`-march=native`), which will optimize the
+generated code for your local machine's CPU type.  However, if you are building
+TensorFlow on one CPU type but will run TensorFlow on a different CPU type, then
+consider specifying a more specific optimization
 flag as described in [the gcc
 documentation](https://gcc.gnu.org/onlinedocs/gcc-4.5.3/gcc/i386-and-x86_002d64-Options.html).
 
@@ -311,6 +311,10 @@ Note the following:
 
 ## Build the pip package
 
+Note: If you're only interested in building the libraries for the TensorFlow C
+or Java APIs, see [Build the C or Java libraries](#BuildCorJava); you do not
+need to build the pip package in that case.
+
 To build a pip package for TensorFlow with CPU-only support,
 you would typically invoke the following command:
 
@@ -503,3 +507,20 @@ Stack Overflow and specify the `tensorflow` tag.
 <tr><td>tensorflow-1.0.0</td><td>CPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.0.0</td><td>GPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>5.1</td><td>8</td></tr>
 </table>
+
+<a name="BuildCorJava"></a>
+## Build the C or Java libraries
+
+The instructions above are tailored to building the TensorFlow Python packages.
+
+If you're interested in building the libraries for the TensorFlow C API, do the
+following:
+
+1.  Follow the steps up to [Configure the installation](#ConfigureInstallation)
+2.  Build the C libraries following instructions in the [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/lib_package/README.md).
+
+If you're interested in building the libraries for the TensorFlow Java API,
+do the following:
+
+1.  Follow the steps up to [Configure the installation](#ConfigureInstallation)
+2.  Build the Java library following instructions in the [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/lib_package/README.md).
-- 
GitLab


From d218339e6a05a984ef7b9a49d66db219d862936e Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 19 Apr 2018 01:26:07 -0700
Subject: [PATCH 132/450] Remove proto import in header files for
 core/kernels/boosted_trees. Move implementations that requires declaration of
 TreeEnsemble to .cc files.

The goal is to make kernels mostly independent of proto headers, which will let us lock down our .so import

PiperOrigin-RevId: 193478404
---
 .../core/kernels/boosted_trees/resources.cc   | 138 ++++++++++++++++
 .../core/kernels/boosted_trees/resources.h    | 154 +++++-------------
 2 files changed, 178 insertions(+), 114 deletions(-)

diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc
index 2ea12c522c..c410748c27 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.cc
+++ b/tensorflow/core/kernels/boosted_trees/resources.cc
@@ -21,6 +21,35 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Constructor.
+BoostedTreesEnsembleResource::BoostedTreesEnsembleResource()
+    : tree_ensemble_(
+          protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(
+              &arena_)) {}
+
+string BoostedTreesEnsembleResource::DebugString() {
+  return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(),
+                         "]");
+}
+
+bool BoostedTreesEnsembleResource::InitFromSerialized(const string& serialized,
+                                                      const int64 stamp_token) {
+  CHECK_EQ(stamp(), -1) << "Must Reset before Init.";
+  if (ParseProtoUnlimited(tree_ensemble_, serialized)) {
+    set_stamp(stamp_token);
+    return true;
+  }
+  return false;
+}
+
+string BoostedTreesEnsembleResource::SerializeAsString() const {
+  return tree_ensemble_->SerializeAsString();
+}
+
+int32 BoostedTreesEnsembleResource::num_trees() const {
+  return tree_ensemble_->trees_size();
+}
+
 int32 BoostedTreesEnsembleResource::next_node(
     const int32 tree_id, const int32 node_id, const int32 index_in_batch,
     const std::vector<TTypes<int32>::ConstVec>& bucketized_features) const {
@@ -49,6 +78,115 @@ float BoostedTreesEnsembleResource::node_value(const int32 tree_id,
   }
 }
 
+int32 BoostedTreesEnsembleResource::GetNumLayersGrown(
+    const int32 tree_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->tree_metadata(tree_id).num_layers_grown();
+}
+
+void BoostedTreesEnsembleResource::SetNumLayersGrown(
+    const int32 tree_id, int32 new_num_layers) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  tree_ensemble_->mutable_tree_metadata(tree_id)->set_num_layers_grown(
+      new_num_layers);
+}
+
+void BoostedTreesEnsembleResource::UpdateLastLayerNodesRange(
+    const int32 node_range_start, int32 node_range_end) const {
+  tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
+      node_range_start);
+  tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
+      node_range_end);
+}
+
+void BoostedTreesEnsembleResource::GetLastLayerNodesRange(
+    int32* node_range_start, int32* node_range_end) const {
+  *node_range_start =
+      tree_ensemble_->growing_metadata().last_layer_node_start();
+  *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
+}
+
+int64 BoostedTreesEnsembleResource::GetNumNodes(const int32 tree_id) {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->trees(tree_id).nodes_size();
+}
+
+int32 BoostedTreesEnsembleResource::GetNumLayersAttempted() {
+  return tree_ensemble_->growing_metadata().num_layers_attempted();
+}
+
+bool BoostedTreesEnsembleResource::is_leaf(const int32 tree_id,
+                                           const int32 node_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  return node.node_case() == boosted_trees::Node::kLeaf;
+}
+
+int32 BoostedTreesEnsembleResource::feature_id(const int32 tree_id,
+                                               const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().feature_id();
+}
+
+int32 BoostedTreesEnsembleResource::bucket_threshold(
+    const int32 tree_id, const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().threshold();
+}
+
+int32 BoostedTreesEnsembleResource::left_id(const int32 tree_id,
+                                            const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().left_id();
+}
+
+int32 BoostedTreesEnsembleResource::right_id(const int32 tree_id,
+                                             const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().right_id();
+}
+
+std::vector<float> BoostedTreesEnsembleResource::GetTreeWeights() const {
+  return {tree_ensemble_->tree_weights().begin(),
+          tree_ensemble_->tree_weights().end()};
+}
+
+float BoostedTreesEnsembleResource::GetTreeWeight(const int32 tree_id) const {
+  return tree_ensemble_->tree_weights(tree_id);
+}
+
+float BoostedTreesEnsembleResource::IsTreeFinalized(const int32 tree_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->tree_metadata(tree_id).is_finalized();
+}
+
+float BoostedTreesEnsembleResource::IsTreePostPruned(
+    const int32 tree_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->tree_metadata(tree_id).post_pruned_nodes_meta_size() >
+         0;
+}
+
+void BoostedTreesEnsembleResource::SetIsFinalized(const int32 tree_id,
+                                                  const bool is_finalized) {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->mutable_tree_metadata(tree_id)->set_is_finalized(
+      is_finalized);
+}
+
+// Sets the weight of i'th tree.
+void BoostedTreesEnsembleResource::SetTreeWeight(const int32 tree_id,
+                                                 const float weight) {
+  DCHECK_GE(tree_id, 0);
+  DCHECK_LT(tree_id, num_trees());
+  tree_ensemble_->set_tree_weights(tree_id, weight);
+}
+
 void BoostedTreesEnsembleResource::UpdateGrowingMetadata() const {
   tree_ensemble_->mutable_growing_metadata()->set_num_layers_attempted(
       tree_ensemble_->growing_metadata().num_layers_attempted() + 1);
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index 561ca3a18a..df78d3f275 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -17,12 +17,16 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_
 
 #include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 
+// Forward declaration for proto class TreeEnsemble
+namespace boosted_trees {
+class TreeEnsemble;
+}  // namespace boosted_trees
+
 // A StampedResource is a resource that has a stamp token associated with it.
 // Before reading from or applying updates to the resource, the stamp should
 // be checked to verify that the update is not stale.
@@ -42,31 +46,15 @@ class StampedResource : public ResourceBase {
 // Keep a tree ensemble in memory for efficient evaluation and mutation.
 class BoostedTreesEnsembleResource : public StampedResource {
  public:
-  // Constructor.
-  BoostedTreesEnsembleResource()
-      : tree_ensemble_(
-            protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(
-                &arena_)) {}
-
-  string DebugString() override {
-    return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(),
-                           "]");
-  }
-
-  bool InitFromSerialized(const string& serialized, const int64 stamp_token) {
-    CHECK_EQ(stamp(), -1) << "Must Reset before Init.";
-    if (ParseProtoUnlimited(tree_ensemble_, serialized)) {
-      set_stamp(stamp_token);
-      return true;
-    }
-    return false;
-  }
-
-  string SerializeAsString() const {
-    return tree_ensemble_->SerializeAsString();
-  }
-
-  int32 num_trees() const { return tree_ensemble_->trees_size(); }
+  BoostedTreesEnsembleResource();
+
+  string DebugString() override;
+
+  bool InitFromSerialized(const string& serialized, const int64 stamp_token);
+
+  string SerializeAsString() const;
+
+  int32 num_trees() const;
 
   // Find the next node to which the example (specified by index_in_batch)
   // traverses down from the current node indicated by tree_id and node_id.
@@ -82,73 +70,31 @@ class BoostedTreesEnsembleResource : public StampedResource {
 
   float node_value(const int32 tree_id, const int32 node_id) const;
 
-  int32 GetNumLayersGrown(const int32 tree_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->tree_metadata(tree_id).num_layers_grown();
-  }
+  int32 GetNumLayersGrown(const int32 tree_id) const;
 
-  void SetNumLayersGrown(const int32 tree_id, int32 new_num_layers) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    tree_ensemble_->mutable_tree_metadata(tree_id)->set_num_layers_grown(
-        new_num_layers);
-  }
+  void SetNumLayersGrown(const int32 tree_id, int32 new_num_layers) const;
 
   void UpdateLastLayerNodesRange(const int32 node_range_start,
-                                 int32 node_range_end) const {
-    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
-        node_range_start);
-    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
-        node_range_end);
-  }
+                                 int32 node_range_end) const;
 
   void GetLastLayerNodesRange(int32* node_range_start,
-                              int32* node_range_end) const {
-    *node_range_start =
-        tree_ensemble_->growing_metadata().last_layer_node_start();
-    *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
-  }
+                              int32* node_range_end) const;
 
-  int64 GetNumNodes(const int32 tree_id) {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->trees(tree_id).nodes_size();
-  }
+  int64 GetNumNodes(const int32 tree_id);
 
   void UpdateGrowingMetadata() const;
 
-  int32 GetNumLayersAttempted() {
-    return tree_ensemble_->growing_metadata().num_layers_attempted();
-  }
-
-  bool is_leaf(const int32 tree_id, const int32 node_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
-    const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    return node.node_case() == boosted_trees::Node::kLeaf;
-  }
-
-  int32 feature_id(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().feature_id();
-  }
-
-  int32 bucket_threshold(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().threshold();
-  }
-
-  int32 left_id(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().left_id();
-  }
-
-  int32 right_id(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().right_id();
-  }
+  int32 GetNumLayersAttempted();
+
+  bool is_leaf(const int32 tree_id, const int32 node_id) const;
+
+  int32 feature_id(const int32 tree_id, const int32 node_id) const;
+
+  int32 bucket_threshold(const int32 tree_id, const int32 node_id) const;
+
+  int32 left_id(const int32 tree_id, const int32 node_id) const;
+
+  int32 right_id(const int32 tree_id, const int32 node_id) const;
 
   // Add a tree to the ensemble and returns a new tree_id.
   int32 AddNewTree(const float weight);
@@ -163,38 +109,18 @@ class BoostedTreesEnsembleResource : public StampedResource {
   // Retrieves tree weights and returns as a vector.
   // It involves a copy, so should be called only sparingly (like once per
   // iteration, not per example).
-  std::vector<float> GetTreeWeights() const {
-    return {tree_ensemble_->tree_weights().begin(),
-            tree_ensemble_->tree_weights().end()};
-  }
-
-  float GetTreeWeight(const int32 tree_id) const {
-    return tree_ensemble_->tree_weights(tree_id);
-  }
-
-  float IsTreeFinalized(const int32 tree_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->tree_metadata(tree_id).is_finalized();
-  }
-
-  float IsTreePostPruned(const int32 tree_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->tree_metadata(tree_id)
-               .post_pruned_nodes_meta_size() > 0;
-  }
-
-  void SetIsFinalized(const int32 tree_id, const bool is_finalized) {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->mutable_tree_metadata(tree_id)->set_is_finalized(
-        is_finalized);
-  }
+  std::vector<float> GetTreeWeights() const;
+
+  float GetTreeWeight(const int32 tree_id) const;
+
+  float IsTreeFinalized(const int32 tree_id) const;
+
+  float IsTreePostPruned(const int32 tree_id) const;
+
+  void SetIsFinalized(const int32 tree_id, const bool is_finalized);
 
   // Sets the weight of i'th tree.
-  void SetTreeWeight(const int32 tree_id, const float weight) {
-    DCHECK_GE(tree_id, 0);
-    DCHECK_LT(tree_id, num_trees());
-    tree_ensemble_->set_tree_weights(tree_id, weight);
-  }
+  void SetTreeWeight(const int32 tree_id, const float weight);
 
   // Resets the resource and frees the protos in arena.
   // Caller needs to hold the mutex lock while calling this.
-- 
GitLab


From b2536f05bb156612c96f204041ea31980b711fc8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 01:56:31 -0700
Subject: [PATCH 133/450] Update feature_util's GetFeatures to show
 compile-time error for unsupported types instead of a link-time error.

PiperOrigin-RevId: 193480683
---
 tensorflow/core/example/feature_util.h | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/example/feature_util.h b/tensorflow/core/example/feature_util.h
index d977935b8a..2265498b5e 100644
--- a/tensorflow/core/example/feature_util.h
+++ b/tensorflow/core/example/feature_util.h
@@ -182,13 +182,25 @@ struct FeatureTrait<
 // Returns true if sequence_example has a feature_list with the specified key.
 bool HasFeatureList(const string& key, const SequenceExample& sequence_example);
 
+template <typename T>
+struct TypeHasFeatures : std::false_type {};
+
+template <>
+struct TypeHasFeatures<Example> : std::true_type {};
+
+template <>
+struct TypeHasFeatures<Features> : std::true_type {};
+
 // A family of template functions to return mutable Features proto from a
 // container proto. Supported ProtoTypes: Example, Features.
 template <typename ProtoType>
-Features* GetFeatures(ProtoType* proto);
+typename std::enable_if<TypeHasFeatures<ProtoType>::value, Features*>::type
+GetFeatures(ProtoType* proto);
 
 template <typename ProtoType>
-const Features& GetFeatures(const ProtoType& proto);
+typename std::enable_if<TypeHasFeatures<ProtoType>::value,
+                        const Features&>::type
+GetFeatures(const ProtoType& proto);
 
 // Base declaration of a family of template functions to return a read only
 // repeated field of feature values.
@@ -300,7 +312,7 @@ bool HasFeature(const string& key, const Features& features);
 template <typename... FeatureType>
 bool HasFeature(const string& key, const Example& example) {
   return HasFeature<FeatureType...>(key, GetFeatures(example));
-};
+}
 
 // DEPRECATED: use HasFeature instead.
 // TODO(gorban): update all clients in a followup CL.
-- 
GitLab


From 5fb3c64421f53aa7ef58ffcee6de47cd4a40fe2d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 02:58:31 -0700
Subject: [PATCH 134/450] Set the random seed in on-demand mode.

PiperOrigin-RevId: 193488103
---
 tensorflow/compiler/jit/xla_compile_on_demand_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index 682d6ea8cc..6c2782e28e 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -67,6 +67,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
   run_options.set_stream(stream);
   run_options.set_allocator(client->backend().memory_allocator());
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
+  run_options.set_rng_seed(ctx->step_id());
 
   auto run_result = executable->Run(launch_context.arguments(), run_options);
   TF_RETURN_IF_ERROR(run_result.status());
-- 
GitLab


From bf86d3a46b4e2ef4dabcba211c1ce36cb81ac315 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 04:27:38 -0700
Subject: [PATCH 135/450] Handle corner case in Python 3: members annotated
 with @classmethod.

PiperOrigin-RevId: 193495506
---
 tensorflow/contrib/autograph/pyct/inspect_utils.py   | 12 +++++++-----
 .../contrib/autograph/pyct/inspect_utils_test.py     |  7 +++++++
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
index a0f56a6c1f..eef74599a7 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -75,13 +75,15 @@ def getdefiningclass(m, owner_class):
   """Resolves the class (e.g. one of the superclasses) that defined a method."""
   # Normalize bound functions to their respective unbound versions.
   m = _get_unbound_function(m)
-  last_defining = owner_class
-  for superclass in tf_inspect.getmro(owner_class):
+  for superclass in owner_class.__bases__:
     if hasattr(superclass, m.__name__):
       superclass_m = getattr(superclass, m.__name__)
-      if _get_unbound_function(superclass_m) == m:
-        last_defining = superclass
-  return last_defining
+      if _get_unbound_function(superclass_m) is m:
+        return superclass
+      elif hasattr(m, '__self__') and m.__self__ == owner_class:
+        # Python 3 class methods only work this way it seems :S
+        return superclass
+  return owner_class
 
 
 def getmethodclass(m):
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
index cf841dae81..1a212f676a 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
@@ -243,6 +243,10 @@ class InspectUtilsTest(test.TestCase):
       def bar(self):
         pass
 
+      @classmethod
+      def class_method(cls):
+        pass
+
     class Subclass(Superclass):
 
       def foo(self):
@@ -257,6 +261,9 @@ class InspectUtilsTest(test.TestCase):
         inspect_utils.getdefiningclass(Subclass.bar, Subclass) is Superclass)
     self.assertTrue(
         inspect_utils.getdefiningclass(Subclass.baz, Subclass) is Subclass)
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.class_method, Subclass) is
+        Superclass)
 
   def test_isbuiltin(self):
     self.assertTrue(inspect_utils.isbuiltin(range))
-- 
GitLab


From 06d802ab61987bde76a30098ff7930c27d561375 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 05:11:30 -0700
Subject: [PATCH 136/450] Support for converting entire class hierarchies:  *
 limit the methods being converted to those that have not been inherited from
 the superclass  * include the (possibly compiled) superclass in the
 definition of the compiled class  * either mark the superclass for conversion
 or generate an absolute aliased import line, depending on whether it's
 whitelisted

PiperOrigin-RevId: 193499204
---
 .../autograph/converters/call_trees.py        | 10 ++--
 tensorflow/contrib/autograph/impl/api.py      |  2 +-
 .../contrib/autograph/impl/conversion.py      | 58 +++++++++++++++---
 .../contrib/autograph/impl/conversion_test.py | 60 +++++++++++++++++++
 4 files changed, 117 insertions(+), 13 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py
index e390d1a262..2e5590b46c 100644
--- a/tensorflow/contrib/autograph/converters/call_trees.py
+++ b/tensorflow/contrib/autograph/converters/call_trees.py
@@ -245,8 +245,6 @@ class CallTreeTransformer(transformer.Base):
     new_call.keywords = node.keywords
     return new_call
 
-  # pylint:disable=invalid-name
-
   def visit_Expr(self, node):
     if isinstance(node.value, gast.Call):
       if anno.hasanno(node.value.func, 'live_val'):
@@ -294,15 +292,17 @@ class CallTreeTransformer(transformer.Base):
         raise NotImplementedError(
             'py_func with return values (unknown function)')
     else:
-      if self.context.recursive:
+      if ast_util.matches(node, 'super(_)'):
+        # super() calls are preserved. The class conversion mechanism will
+        # ensure that they return the correct value.
+        pass
+      elif self.context.recursive:
         node = self._insert_dynamic_conversion(node)
       else:
         # Unresolved functions are allowed in non-recursive mode.
         pass
     return node
 
-  # pylint:enable=invalid-name
-
 
 def transform(node, context, uncompiled_modules, nocompile_decorators):
   """Transform function call to the compiled counterparts.
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index f97a33326e..d874ef15c9 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -241,7 +241,7 @@ def to_graph(e,
   module = gast.Module([])
   for import_line in config.COMPILED_IMPORT_STATEMENTS:
     module.body.extend(parser.parse_str(import_line).body)
-  for dep in conversion_map.dependency_cache.values():
+  for dep in reversed(conversion_map.dependency_cache.values()):
     module.body.append(dep)
   compiled_node, compiled_src = compiler.ast_to_object(module)
 
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 5653e991f6..e7230a5f45 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import imp
 
 import gast
@@ -39,6 +40,7 @@ from tensorflow.contrib.autograph.converters import side_effect_guards
 from tensorflow.contrib.autograph.converters import single_return
 from tensorflow.contrib.autograph.impl import config
 from tensorflow.contrib.autograph.impl import naming
+from tensorflow.contrib.autograph.pyct import ast_util
 from tensorflow.contrib.autograph.pyct import context
 from tensorflow.contrib.autograph.pyct import inspect_utils
 from tensorflow.contrib.autograph.pyct import parser
@@ -81,7 +83,9 @@ class ConversionMap(object):
     self.recursive = recursive
     self.nocompile_decorators = nocompile_decorators
     self.partial_types = partial_types if partial_types else ()
-    self.dependency_cache = {}
+    # Required to output dependencies in discovery order, which should match
+    # the reverse dependency order.
+    self.dependency_cache = collections.OrderedDict()
     self.additional_imports = set()
     self.name_map = {}
     self.api_module = api_module
@@ -201,6 +205,9 @@ def class_to_graph(c, conversion_map):
 
   class_namespace = {}
   for _, m in members:
+    # Only convert the members that are directly defined by the class.
+    if inspect_utils.getdefiningclass(m, c) is not c:
+      continue
     node, _, namespace = function_to_graph(
         m,
         conversion_map=conversion_map,
@@ -214,12 +221,49 @@ def class_to_graph(c, conversion_map):
     converted_members[m] = node
   namer = conversion_map.new_namer(class_namespace)
   class_name = namer.compiled_class_name(c.__name__, c)
-  node = gast.ClassDef(
-      class_name,
-      bases=[],
-      keywords=[],
-      body=list(converted_members.values()),
-      decorator_list=[])
+
+  # TODO(mdan): This needs to be explained more thoroughly.
+  # Process any base classes: if the superclass is of a whitelisted type, an
+  # absolute import line is generated. Otherwise, it is marked for conversion
+  # (as a side effect of the call to namer.compiled_class_name() followed by
+  # conversion_map.update_name_map(namer)).
+  output_nodes = []
+  renames = {}
+  bases = []
+  for base in c.__bases__:
+    if isinstance(object, base):
+      bases.append('object')
+      continue
+    if is_whitelisted_for_graph(base):
+      alias = namer.new_symbol(base.__name__, ())
+      output_nodes.append(
+          gast.ImportFrom(
+              module=base.__module__,
+              names=[gast.alias(name=base.__name__, asname=alias)],
+              level=0))
+    else:
+      # This will trigger a conversion into a class with this name.
+      alias = namer.compiled_class_name(base.__name__, base)
+    bases.append(alias)
+    renames[qual_names.QN(base.__name__)] = qual_names.QN(alias)
+  conversion_map.update_name_map(namer)
+
+  # Generate the definition of the converted class.
+  output_nodes.append(
+      gast.ClassDef(
+          class_name,
+          bases=bases,
+          keywords=[],
+          body=list(converted_members.values()),
+          decorator_list=[]))
+  node = gast.Module(output_nodes)
+
+  # Make a final pass to replace references to the class or its base classes.
+  # Most commonly, this occurs when making super().__init__() calls.
+  # TODO(mdan): Making direct references to superclass' superclass will fail.
+  node = qual_names.resolve(node)
+  renames[qual_names.QN(c.__name__)] = qual_names.QN(class_name)
+  node = ast_util.rename_symbols(node, renames)
 
   return node, class_name, class_namespace
 
diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
index da3220892f..5edd8e74a8 100644
--- a/tensorflow/contrib/autograph/impl/conversion_test.py
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.autograph import utils
 from tensorflow.contrib.autograph.impl import api
 from tensorflow.contrib.autograph.impl import conversion
 from tensorflow.python.framework import constant_op
+from tensorflow.python.keras._impl.keras.engine import training
 from tensorflow.python.platform import test
 
 
@@ -78,6 +79,65 @@ class ConversionTest(test.TestCase):
         conversion_map.dependency_cache[f].body[0].body[0].value.func.id)
     self.assertEqual('tf__g', conversion_map.dependency_cache[g].name)
 
+  def test_entity_to_graph_class_hierarchy(self):
+
+    class TestBase(object):
+
+      def __init__(self, x='base'):
+        self.x = x
+
+      def foo(self):
+        return self.x
+
+      def bar(self):
+        return self.x
+
+    class TestSubclass(TestBase):
+
+      def __init__(self, y):
+        super(TestSubclass, self).__init__('sub')
+        self.y = y
+
+      def foo(self):
+        return self.y
+
+      def baz(self):
+        return self.y
+
+    conversion_map = self._simple_conversion_map()
+    conversion.entity_to_graph(TestSubclass, conversion_map, None, None)
+
+    self.assertTrue(TestBase in conversion_map.dependency_cache)
+    self.assertTrue(TestSubclass in conversion_map.dependency_cache)
+    self.assertEqual('TfTestBase',
+                     conversion_map.dependency_cache[TestBase].body[-1].name)
+    self.assertEqual(
+        'TfTestSubclass',
+        conversion_map.dependency_cache[TestSubclass].body[-1].name)
+
+  def test_entity_to_graph_class_hierarchy_whitelisted(self):
+
+    class TestSubclass(training.Model):
+
+      def __init__(self, y):
+        super(TestSubclass, self).__init__()
+        self.built = False
+
+      def call(self, x):
+        return 3 * x
+
+    conversion_map = self._simple_conversion_map()
+    conversion.entity_to_graph(TestSubclass, conversion_map, None, None)
+
+    self.assertTrue(TestSubclass in conversion_map.dependency_cache)
+    self.assertFalse(training.Model in conversion_map.dependency_cache)
+    self.assertEqual(
+        'Model',
+        conversion_map.dependency_cache[TestSubclass].body[0].names[0].name)
+    self.assertEqual(
+        'TfTestSubclass',
+        conversion_map.dependency_cache[TestSubclass].body[-1].name)
+
   def test_entity_to_graph_lambda(self):
     f = lambda a: a
 
-- 
GitLab


From 40f77655affb162d32b7d4861fa68c35fc3d8f7a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 06:58:34 -0700
Subject: [PATCH 137/450] Update the Colorbot demo to use a Keras model in
 addition to the Estimator.

PiperOrigin-RevId: 193508874
---
 ...imator.ipynb => rnn_keras_estimator.ipynb} | 677 +++++-------------
 1 file changed, 167 insertions(+), 510 deletions(-)
 rename tensorflow/contrib/autograph/examples/notebooks/{rnn_colorbot_estimator.ipynb => rnn_keras_estimator.ipynb} (50%)

diff --git a/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
similarity index 50%
rename from tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb
rename to tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
index 7f5e4d4ac1..324b23c24b 100644
--- a/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb
+++ b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
@@ -62,7 +62,7 @@
         }
       },
       "source": [
-        "# Case study: building an RNN\n"
+        "# Case study: training a custom RNN, using Keras and Estimators\n"
       ]
     },
     {
@@ -118,6 +118,16 @@
         "  length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)\n",
         "  return rgb, chars, length\n",
         "\n",
+        "\n",
+        "def set_static_batch_shape(batch_size):\n",
+        "  def apply(rgb, chars, length):\n",
+        "    rgb.set_shape((batch_size, None))\n",
+        "    chars.set_shape((batch_size, None, 256))\n",
+        "    length.set_shape((batch_size,))\n",
+        "    return rgb, chars, length\n",
+        "  return apply\n",
+        "\n",
+        "\n",
         "def load_dataset(data_dir, url, batch_size, training=True):\n",
         "  \"\"\"Loads the colors data at path into a tf.PaddedDataset.\"\"\"\n",
         "  path = tf.keras.utils.get_file(os.path.basename(url), url, cache_dir=data_dir)\n",
@@ -129,7 +139,10 @@
         "  if training:\n",
         "    dataset = dataset.shuffle(buffer_size=3000)\n",
         "  dataset = dataset.padded_batch(\n",
-        "      batch_size, padded_shapes=([None], [None, None], []))\n",
+        "      batch_size, padded_shapes=((None,), (None, 256), ()))\n",
+        "  # To simplify the model code, we statically set as many of the shapes as we\n",
+        "  # know.\n",
+        "  dataset = dataset.map(set_static_batch_shape(batch_size))\n",
         "  return dataset"
       ]
     },
@@ -145,7 +158,8 @@
       "source": [
         "To show the use of control flow, we write the RNN loop by hand, rather than using a pre-built RNN model.\n",
         "\n",
-        "Note how we write the model code in Eager style, with regular `if` and `while` statements. Then, we annotate the functions with `@autograph.convert` to have them automatically compiled to run in graph mode."
+        "Note how we write the model code in Eager style, with regular `if` and `while` statements. Then, we annotate the functions with `@autograph.convert` to have them automatically compiled to run in graph mode.\n",
+        "We use Keras to define the model, and we will train it using Estimators."
       ]
     },
     {
@@ -166,70 +180,72 @@
       },
       "outputs": [],
       "source": [
-        "class RnnColorbot(object):\n",
-        "  \"\"\"Holds the parameters of the colorbot model.\"\"\"\n",
+        "@autograph.convert()\n",
+        "class RnnColorbot(tf.keras.Model):\n",
+        "  \"\"\"RNN Colorbot model.\"\"\"\n",
         "\n",
         "  def __init__(self):\n",
+        "    super(RnnColorbot, self).__init__()\n",
         "    self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
         "    self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
         "    self.relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
         "\n",
+        "\n",
+        "  def _rnn_layer(self, chars, cell, batch_size, training):\n",
+        "    \"\"\"A single RNN layer.\n",
+        "\n",
+        "    Args:\n",
+        "      chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n",
+        "      cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "      batch_size: Int, the batch size to use\n",
+        "      training: Boolean, whether the layer is used for training\n",
+        "\n",
+        "    Returns:\n",
+        "      A Tensor of shape (max_sequence_length, batch_size, output_size).\n",
+        "    \"\"\"\n",
+        "    hidden_outputs = []\n",
+        "    autograph.utils.set_element_type(hidden_outputs, tf.float32)\n",
+        "    state, output = cell.zero_state(batch_size, tf.float32)\n",
+        "    for ch in chars:\n",
+        "      cell_output, (state, output) = cell.call(ch, (state, output))\n",
+        "      hidden_outputs.append(cell_output)\n",
+        "    hidden_outputs = hidden_outputs.stack()\n",
+        "    if training:\n",
+        "      hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n",
+        "    return hidden_outputs\n",
+        "\n",
+        "  def build(self, _):\n",
+        "    \"\"\"Creates the model variables. See keras.Model.build().\"\"\"\n",
         "    self.lower_cell.build(tf.TensorShape((None, 256)))\n",
         "    self.upper_cell.build(tf.TensorShape((None, 256)))\n",
-        "    self.relu_layer.build(tf.TensorShape((None, 128)))\n",
+        "    self.relu_layer.build(tf.TensorShape((None, 128)))    \n",
+        "    self.built = True\n",
         "\n",
         "\n",
-        "def rnn_layer(chars, cell, batch_size, training):\n",
-        "  \"\"\"A simple RNN layer.\n",
-        "  \n",
-        "  Args:\n",
-        "    chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n",
-        "    cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
-        "    batch_size: Int, the batch size to use\n",
-        "    training: Boolean, whether the layer is used for training\n",
+        "  def call(self, inputs, training=False):\n",
+        "    \"\"\"The RNN model code, written in Eager style and converted by AutoGraph.\n",
         "\n",
-        "  Returns:\n",
-        "    A Tensor of shape (max_sequence_length, batch_size, output_size).\n",
-        "  \"\"\"\n",
-        "  hidden_outputs = []\n",
-        "  autograph.utils.set_element_type(hidden_outputs, tf.float32)\n",
-        "  state, output = cell.zero_state(batch_size, tf.float32)\n",
-        "  for ch in chars:\n",
-        "    cell_output, (state, output) = cell.call(ch, (state, output))\n",
-        "    hidden_outputs.append(cell_output)\n",
-        "  hidden_outputs = hidden_outputs.stack()\n",
-        "  if training:\n",
-        "    hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n",
-        "  return hidden_outputs\n",
+        "    The model consists of two RNN layers (made by lower_cell and upper_cell),\n",
+        "    followed by a fully connected layer with ReLU activation.\n",
         "\n",
+        "    Args:\n",
+        "      inputs: A tuple (chars, length)\n",
+        "      training: Boolean, whether the layer is used for training\n",
         "\n",
-        "@autograph.convert(recursive=True)\n",
-        "def model(inputs, colorbot, batch_size, training):\n",
-        "  \"\"\"RNNColorbot model.\n",
-        "  \n",
-        "  The model consists of two RNN layers (made by lower_cell and upper_cell),\n",
-        "  followed by a fully connected layer with ReLU activation.\n",
-        "  \n",
-        "  Args:\n",
-        "    inputs: A tuple (chars, length)\n",
-        "    colorbot: An object of type RnnColorbot\n",
-        "    batch_size: Int, the batch size to use\n",
-        "    training: Boolean, whether the layer is used for training\n",
-        "    \n",
-        "  Returns:\n",
-        "    A Tensor of shape (batch_size, 3) - the model predictions.\n",
-        "  \"\"\"\n",
-        "  (chars, length) = inputs\n",
-        "  seq = tf.transpose(chars, [1, 0, 2])\n",
-        "  seq.set_shape((None, batch_size, 256))\n",
+        "    Returns:\n",
+        "      A Tensor of shape (batch_size, 3) - the model predictions.\n",
+        "    \"\"\"\n",
+        "    chars, length = inputs\n",
+        "    batch_size = chars.shape[0]\n",
+        "    seq = tf.transpose(chars, (1, 0, 2))\n",
         "\n",
-        "  seq = rnn_layer(seq, colorbot.lower_cell, batch_size, training)\n",
-        "  seq = rnn_layer(seq, colorbot.upper_cell, batch_size, training)\n",
+        "    seq = self._rnn_layer(seq, self.lower_cell, batch_size, training)\n",
+        "    seq = self._rnn_layer(seq, self.upper_cell, batch_size, training)\n",
         "\n",
-        "  # Grab just the end-of-sequence from each output.\n",
-        "  indices = tf.stack([length - 1, range(batch_size)], axis=1)\n",
-        "  sequence_ends = tf.gather_nd(seq, indices)\n",
-        "  return colorbot.relu_layer(sequence_ends)\n",
+        "    # Grab just the end-of-sequence from each output.\n",
+        "    indices = tf.stack([length - 1, range(batch_size)], axis=1)\n",
+        "    sequence_ends = tf.gather_nd(seq, indices)\n",
+        "    return self.relu_layer(sequence_ends)\n",
         "\n",
         "@autograph.convert()\n",
         "def loss_fn(labels, predictions):\n",
@@ -246,9 +262,9 @@
         }
       },
       "source": [
-        "We will now create the model function for the estimator.\n",
+        "We will now create the model function for the custom Estimator.\n",
         "\n",
-        "In the model function, we simply call the converted functions that we defined above - that's it!"
+        "In the model function, we simply use the model class we defined above - that's it!"
       ]
     },
     {
@@ -275,14 +291,12 @@
         "  sequence_length = features['sequence_length']\n",
         "  inputs = (chars, sequence_length)\n",
         "\n",
-        "  # Create the model components.\n",
-        "  # Simply calling the AutoGraph-ed functions and objects just works!\n",
+        "  # Create the model. Simply using the AutoGraph-ed class just works!\n",
         "  colorbot = RnnColorbot()\n",
-        "  \n",
-        "  batch_size = params['batch_size']\n",
+        "  colorbot.build(None)\n",
         "\n",
         "  if mode == tf.estimator.ModeKeys.TRAIN:\n",
-        "    predictions = model(inputs, colorbot, batch_size, training=True)\n",
+        "    predictions = colorbot(inputs, training=True)\n",
         "    loss = loss_fn(labels, predictions)\n",
         "\n",
         "    learning_rate = params['learning_rate']\n",
@@ -292,14 +306,13 @@
         "    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)\n",
         "\n",
         "  elif mode == tf.estimator.ModeKeys.EVAL:\n",
-        "    predictions = model(inputs, colorbot, batch_size, training=False)\n",
+        "    predictions = colorbot(inputs)\n",
         "    loss = loss_fn(labels, predictions)\n",
         "\n",
         "    return tf.estimator.EstimatorSpec(mode, loss=loss)\n",
-        "  \n",
+        "\n",
         "  elif mode == tf.estimator.ModeKeys.PREDICT:\n",
-        "    # For prediction, we expect single tensors.\n",
-        "    predictions = model(inputs, colorbot, 1, training=False)\n",
+        "    predictions = colorbot(inputs)\n",
         "\n",
         "    predictions = tf.minimum(predictions, 1.0)\n",
         "    return tf.estimator.EstimatorSpec(mode, predictions=predictions)"
@@ -368,7 +381,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": 7,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -379,9 +392,9 @@
         },
         "colab_type": "code",
         "executionInfo": {
-          "elapsed": 10064,
+          "elapsed": 10604,
           "status": "ok",
-          "timestamp": 1523580419240,
+          "timestamp": 1524095272039,
           "user": {
             "displayName": "",
             "photoUrl": "",
@@ -390,7 +403,7 @@
           "user_tz": 240
         },
         "id": "2pg1AfbxBJQq",
-        "outputId": "41894b16-3d3a-4e30-f6e4-5a9c837a2210",
+        "outputId": "9c924b4f-06e1-4538-976c-a3e1ddac5660",
         "slideshow": {
           "slide_type": "-"
         }
@@ -400,7 +413,7 @@
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "Eval loss at step 100: 0.0665446\n"
+            "Eval loss at step 100: 0.0674834\n"
           ]
         }
       ],
@@ -444,7 +457,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": 8,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -455,9 +468,9 @@
         },
         "colab_type": "code",
         "executionInfo": {
-          "elapsed": 31286,
+          "elapsed": 7990,
           "status": "ok",
-          "timestamp": 1523580450579,
+          "timestamp": 1524095280105,
           "user": {
             "displayName": "",
             "photoUrl": "",
@@ -466,7 +479,7 @@
           "user_tz": 240
         },
         "id": "dxHex2tUN_10",
-        "outputId": "b3dc558d-b800-4e9b-e60e-3441124e80d8",
+        "outputId": "2b889e5a-b9ed-4645-bf03-d98f26c72101",
         "slideshow": {
           "slide_type": "slide"
         }
@@ -478,7 +491,7 @@
               "\u003clink rel=stylesheet type=text/css href='/nbextensions/google.colab/tabbar.css'\u003e\u003c/link\u003e"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.HTML at 0x7f4112527e90\u003e"
+              "\u003cIPython.core.display.HTML at 0x7f3f36aa6cd0\u003e"
             ]
           },
           "metadata": {
@@ -494,7 +507,7 @@
               "\u003cscript src='/nbextensions/google.colab/tabbar_main.min.js'\u003e\u003c/script\u003e"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.HTML at 0x7f4112527f10\u003e"
+              "\u003cIPython.core.display.HTML at 0x7f3eca67f7d0\u003e"
             ]
           },
           "metadata": {
@@ -510,7 +523,7 @@
               "\u003cdiv id=\"id1\"\u003e\u003c/div\u003e"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.HTML at 0x7f4112527f50\u003e"
+              "\u003cIPython.core.display.HTML at 0x7f3eca67f8d0\u003e"
             ]
           },
           "metadata": {
@@ -523,11 +536,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f474-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = colab_lib.createTabBar({\"initialSelection\": 0, \"location\": \"top\", \"contentHeight\": [\"initial\"], \"borderColor\": [\"#a7a7a7\"], \"contentBorder\": [\"0px\"], \"tabNames\": [\"RNN Colorbot\"], \"elementId\": \"id1\"});\n",
-              "//# sourceURL=js_a0db480422"
+              "window[\"e8ddfa22-4362-11e8-91ec-c8d3ffb5fbe0\"] = colab_lib.createTabBar({\"contentBorder\": [\"0px\"], \"elementId\": \"id1\", \"borderColor\": [\"#a7a7a7\"], \"contentHeight\": [\"initial\"], \"tabNames\": [\"RNN Colorbot\"], \"location\": \"top\", \"initialSelection\": 0});\n",
+              "//# sourceURL=js_71b9087b6d"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd1d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f950\u003e"
             ]
           },
           "metadata": {
@@ -540,11 +553,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f475-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_d2a46ea291"
+              "window[\"e8ddfa23-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_e390445f33"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd0d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f990\u003e"
             ]
           },
           "metadata": {
@@ -557,11 +570,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f476-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_0a8262c6e9"
+              "window[\"e8ddfa24-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_241dd76d85"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd390\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fc50\u003e"
             ]
           },
           "metadata": {
@@ -575,11 +588,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f477-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_e32f85ccd2"
+              "window[\"e8ddfa25-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_60c64e3d50"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fd90\u003e"
             ]
           },
           "metadata": {
@@ -593,11 +606,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f478-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"2c60f477-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_eaee748b21"
+              "window[\"e8ddfa26-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"e8ddfa25-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_14ea437cbd"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd550\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fe10\u003e"
             ]
           },
           "metadata": {
@@ -611,11 +624,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f479-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_2befe06587"
+              "window[\"e8ddfa27-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_09294c2226"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4112527f10\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fcd0\u003e"
             ]
           },
           "metadata": {
@@ -629,11 +642,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"354d7b1a-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"2c60f476-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_8ec4aeeb25"
+              "window[\"ec965514-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"e8ddfa24-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_e5e8266997"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd690\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fe10\u003e"
             ]
           },
           "metadata": {
@@ -647,11 +660,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"354d7b1b-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_9f9f4574f1"
+              "window[\"ec965515-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_07a097f0ee"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd350\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fc90\u003e"
             ]
           },
           "metadata": {
@@ -665,11 +678,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"354d7b1c-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_bcccd8f300"
+              "window[\"ec965516-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_790d669ca8"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd6d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f8d0\u003e"
             ]
           },
           "metadata": {
@@ -683,11 +696,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"354d7b1d-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b1c-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_2c056cee72"
+              "window[\"ec965517-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec965516-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_d30df771f0"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fd90\u003e"
             ]
           },
           "metadata": {
@@ -701,11 +714,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"354d7b1e-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_c853c3f58b"
+              "window[\"ec965518-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_8a43a2da4b"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd610\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fc50\u003e"
             ]
           },
           "metadata": {
@@ -718,369 +731,9 @@
         },
         {
           "data": {
-            "application/javascript": [
-              "window[\"354d7b1f-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b1b-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_e5730ab00d"
-            ],
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQwAAAENCAYAAAD60Fs2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAACMBJREFUeJzt3F+I1XX+x/G32zjiFERUpgaFd2JBzOg5joX4h0SiMgmM\n/uhVGIlgFBlERGB3hUEkhkRdtDfRP1ACL6KpLBqcguxCjEAkmGamQcSohFHzsxe7O6zssvsydtff\n+ns8rs758j3f8z7fiyef7/k3o7XWCiDwh4s9APC/QzCAmGAAMcEAYoIBxAQDiAkGF8XTTz9d3W63\n7rvvvhoZGakVK1Zc7JEICMYlbvXq1TU8PHyxxzjPV199VcPDw/XZZ5/V22+/XVVVM2bMuMhTkRAM\n/qt+++23+uGHH+r666+vWbNmXexxuECCcQl76qmnanx8vLZs2VIDAwP1+uuv1zfffFP3339/dTqd\nWr9+fY2MjEzvv2nTpnr55ZfrgQceqIGBgXr44Yfr5MmTVVV1+vTp2r59ey1durQ6nU5t2LChTpw4\nUVVVk5OTtWXLllq6dGmtXbu23nnnnelj7tq1q7Zt21bbt2+vJUuW1HvvvVfPPvtsHTp0qAYGBmrX\nrl1/N/fRo0dr06ZN1el06u67766hoaGqqhodHa1OpzO93zPPPFO33nrr9P3t27fXm2+++e89iZyv\ncUlbtWpVGx4ebq21NjEx0brdbjtw4EBrrbUvvviidbvdduLEidZaaxs3bmxr1qxp33//fZuammob\nN25sO3fubK219tZbb7VHH320TU1NtXPnzrXDhw+3X375pbXW2kMPPdR27NjRTp8+3Y4cOdIGBwen\nn/OVV15pN910U/voo49aa61NTU21999/vz344IPTMx48eLCtWLGitdbamTNn2po1a9qePXvamTNn\n2vDwcOvv72/Hjh2bfj2HDx9urbW2du3advvtt7ejR4+21lpbuXJlO3LkyH/qVNJas8L4f6D95edC\n+/btq5UrV9by5curqmrZsmV1880316effjq977333ls33HBD9fb21h133FFHjhypqqqenp46efJk\nHTt2rGbMmFGLFi2qyy+/vCYmJurrr7+uJ598smbOnFkLFy6sDRs21N69e6eP2d/fX6tXr66qqt7e\n3n8666FDh+rUqVP1yCOPVE9PTw0ODtaqVavqgw8+qKqqJUuW1MjISB0/fryqqtauXVtffvlljY6O\n1q+//loLFy78N501/pGeiz0A/z1jY2O1f//++vjjj6vqzyE5e/ZsLVu2bHqfa665Zvr27Nmz69Sp\nU1VVdc8999TExEQ98cQT9fPPP9e6devq8ccfr8nJybryyitr9uzZ04+bP39+HT58ePr+3Llz4xkn\nJydr3rx5522bP39+TU5OVlVVp9OpoaGhuu6666rb7Va32629e/dWb29vLV68+ALOBr+HYFzi/vbT\nh3nz5tX69etrx44dF3ycnp6e2rp1a23durXGxsZq8+bNtWDBgrrtttvqp59+qlOnTlVfX19VVY2P\nj9ecOXP+4Qz/ypw5c2p8fPy8bWNjY7VgwYKqqup2u/Xiiy/WvHnzqtPp1MDAQD333HPV29tb3W73\ngl8XF8YlySXu2muvrdHR0aqqWrduXQ0NDdXnn39e586dq6mpqRoZGakff/zxXx7n4MGD9d1339W5\nc+eqr6+venp66rLLLqu5c+dWf39/vfTSS3X69On69ttv6913361169b9rnlvueWW6uvrq9dee63O\nnj1bBw8erE8++aTuvPPOqqq68cYba9asWbVv377qdDp1xRVX1NVXX10ffvjheW+I8p8hGJe4zZs3\n1+7du6vb7db+/ftr9+7dtWfPnlq2bFmtWrWq3njjjen3OP7ZSuD48eO1bdu2Wrx4cd111121dOnS\n6Sjs3LmzRkdHa/ny5bVt27Z
67LHHzrvMuRAzZ86sV199tQ4cOFCDg4P1/PPP1wsvvDC9wqj68yrj\nqquumr7U+WsoFi1a9Luek9yM1vyBDpCxwgBiggHEBAOICQYQ+z/7PYzjf/QRGVxM12z68u+2WWEA\nMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHE\nBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhAT\nDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEww\ngJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOI\nCQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAm\nGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhg\nADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIB\nxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQ\nEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBM\nMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHB\nAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQD\niAkGEBMMIDajtdYu9hDA/wYrDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4j9CY2LTAbbRbWuAAAAAElFTkSuQmCC\n",
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2050\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"354d7b20-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_a897ef7e24"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2250\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"354d7b21-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_565fa3d154"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4113124d90\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"354d7b22-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b21-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_222e0dc6af"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4113124c10\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"354d7b23-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_831db7458f"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4113124310\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab4-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b20-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_adb576c6eb"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f990850\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab5-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_9418f2d32f"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f990850\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab6-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_3fad25f306"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4112527ed0\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab7-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fab6-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_45b9340e7b"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f990c90\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab8-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_bec9896d44"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f990a10\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab9-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fab5-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_460b91ad4a"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3a10\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803faba-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_7dedd0b037"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3890\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fabb-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_4b1c977dc7"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3bd0\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fabc-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fabb-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_d64fedfcf9"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3410\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fabd-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_3e8c929c3f"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3c50\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3b9b986c-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803faba-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_9f9cf2b76f"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd590\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3b9b986d-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_b402e6b587"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3d90\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3b9b986e-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_9b7d66db72"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3b10\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3b9b986f-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b986e-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_11ec213a3f"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3950\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3b9b9870-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_9c055e4bc0"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3850\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQwAAAENCAYAAAD60Fs2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAACMRJREFUeJzt3F+IlfW+x/Gvp3FECyIqU4PCO7EgZnQtnUJ0JJGoTDoY\n/dGrMBJhosggIgK7KwwiMdxRF11F/0AJvIisLBqcguxCjEAkmNQGcRvVwIzm71zsc4Yje7P3x9h7\nz97u1+tqrYdnPeu7nos3v2f9m9FaawUQ+K/pHgD49yEYQEwwgJhgADHBAGKCAcQEg2nx9NNPV7fb\nrfvuu69GRkZq5cqV0z0SAcG4xK1evbqGh4ene4wLfPXVVzU8PFyfffZZvf3221VVNWPGjGmeioRg\n8E/122+/1Q8//FDXX399zZo1a7rH4SIJxiXsqaeeqhMnTtSWLVuqv7+/Xn/99frmm2/q/vvvr06n\nU+vXr6+RkZGp/Tdt2lQvv/xyPfDAA9Xf318PP/xwnTlzpqqqJicna9u2bbVs2bLqdDq1YcOGOn36\ndFVVjY2N1ZYtW2rZsmW1du3aeuedd6aOuXPnzhoaGqpt27bV0qVL67333qtnn322Dh06VP39/bVz\n584/m/vo0aO1adOm6nQ6dffdd9f+/furqmp0dLQ6nc7Ufs8880zdeuutU/e3bdtWb7755t/3JHKh\nxiVtcHCwDQ8Pt9ZaO3nyZOt2u+3AgQOttda++OKL1u122+nTp1trrW3cuLGtWbOmff/9921iYqJt\n3Lix7dixo7XW2ltvvdUeffTRNjEx0c6fP98OHz7cfvnll9Zaaw899FDbvn17m5ycbEeOHGnLly+f\nes5XXnml3XTTTe2jjz5qrbU2MTHR3n///fbggw9OzXjw4MG2cuXK1lprZ8+ebWvWrGm7d+9uZ8+e\nbcPDw62vr68dO3Zs6vUcPny4tdba2rVr2+23396OHj3aWmtt1apV7ciRI/+oU0lrzQrjP0D7358L\n7d27t1atWlUrVqyoqqqBgYG6+eab69NPP53a9957760bbrihent764477qgjR45UVVVPT0+dOXOm\njh07VjNmzKjFixfX5ZdfXidPnqyvv/66nnzyyZo5c2YtWrSoNmzYUHv27Jk6Zl9fX61evbqqqnp7\ne//qrIcOHarx8fF65JFHqqenp5YvX16Dg4P1wQcfVFXV0qVLa2RkpE6dOlVVVWvXrq0vv/yyRkdH\n69dff61Fixb9nc4af0nPdA/AP8/x48dr37599fHHH1fVn0Jy7ty5GhgYmNrnmmuumbo9e/bsGh8f\nr6qqe+65p06ePFlPPPFE/fzzz7Vu3bp6/PHHa2xsrK688sqaPXv21OMWLFhQhw8fnro/b968eMax\nsbGaP3/+BdsWLFhQY2NjVVXV6XRq//79dd1111W3261ut1t79uyp3t7eWrJkyUWcDX4PwbjE/f9P\nH+bPn1/r16+v7du3X/Rxenp6auvWrbV169Y6fvx4bd68uRYuXFi33XZb/fTTTzU+Pl5z5sypqqoT\nJ07U3Llz/+IMf8vcuXPrxIkTF2w7fvx4LVy4sKqqut1uvfjiizV//vzqdDrV399fzz33XPX29la3\n273o18XFcUlyibv22mtrdHS0qqrWrVtX+/fvr88//7zOnz9fExMTNTIyUj/++OPfPM7Bgwfru+++\nq/Pnz9ecOXOqp6enLrvsspo3b1719fXVSy+9VJOTk/Xtt9/Wu+++W+vWrftd895yyy01Z86ceu21\n1+rcuXN18ODB+uSTT+rOO++sqqobb7yxZs2aVXv37q1Op1NXXHFFXX311fXhhx9e8IYo/xiCcYnb\nvHlz7dq1q7rdbu3bt6927dpVu3fvroGBgRocHKw33nhj6j2Ov7YSOHXqVA0NDdWSJUvqrrvuqmXL\nlk1FYceOHTU6OlorVqyooaG
heuyxxy64zLkYM2fOrFdffbUOHDhQy5cvr+eff75eeOGFqRVG1Z9W\nGVddddXUpc7/hWLx4sW/6znJzWjNH+gAGSsMICYYQEwwgJhgALF/2e9h/PEP/z3dI8B/tKseee/P\ntllhADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOI\nCQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAm\nGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhg\nADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIB\nxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQ\nEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBM\nMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHB\nAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQD\niAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwg\nJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICY\nYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKC\nAcQEA4gJBhATDCA2o7XWpnsI4N+DFQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEww\ngJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHE/gfh60wGjfc7LQAAAABJRU5ErkJg\ngg==\n",
-            "text/plain": [
-              "\u003cmatplotlib.figure.Figure at 0x7f4113124310\u003e"
+              "\u003cmatplotlib.figure.Figure at 0x7f3ecc00bf10\u003e"
             ]
           },
           "metadata": {
@@ -1095,11 +748,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9871-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b986d-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_ba6a061307"
+              "window[\"ec965519-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec965515-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_893ad561f4"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd890\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55c90\u003e"
             ]
           },
           "metadata": {
@@ -1113,11 +766,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9872-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_83e3496927"
+              "window[\"ec96551a-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_2d99e0ac17"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd590\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fe50\u003e"
             ]
           },
           "metadata": {
@@ -1131,11 +784,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9873-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_f437bab20d"
+              "window[\"ec96551b-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_5c19462e32"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a22d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55dd0\u003e"
             ]
           },
           "metadata": {
@@ -1149,11 +802,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9874-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b9873-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_93aa63450e"
+              "window[\"ec96551c-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec96551b-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_b9c8b7567b"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2b90\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55a50\u003e"
             ]
           },
           "metadata": {
@@ -1167,11 +820,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9875-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_aca189bea5"
+              "window[\"ec96551d-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_fd05186348"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd4d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55810\u003e"
             ]
           },
           "metadata": {
@@ -1185,10 +838,10 @@
         {
           "data": {
             "text/html": [
-              "\u003cdiv class=id_100313201 style=\"margin-right:10px; display:flex;align-items:center;\"\u003e\u003cspan style=\"margin-right: 3px;\"\u003e\u003c/span\u003e\u003c/div\u003e"
+              "\u003cdiv class=id_888646481 style=\"margin-right:10px; display:flex;align-items:center;\"\u003e\u003cspan style=\"margin-right: 3px;\"\u003e\u003c/span\u003e\u003c/div\u003e"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.HTML at 0x7f410f990a90\u003e"
+              "\u003cIPython.core.display.HTML at 0x7f3f32414810\u003e"
             ]
           },
           "metadata": {
@@ -1203,11 +856,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9876-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 span\");\n",
-              "//# sourceURL=js_5df1fe383e"
+              "window[\"ec96551e-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 span\");\n",
+              "//# sourceURL=js_efef96e882"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55710\u003e"
             ]
           },
           "metadata": {
@@ -1222,11 +875,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9877-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3b9b9876-3eb4-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
-              "//# sourceURL=js_c62c7174ad"
+              "window[\"ec96551f-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ec96551e-4362-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
+              "//# sourceURL=js_6eca889864"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2390\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f990\u003e"
             ]
           },
           "metadata": {
@@ -1241,11 +894,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3ed76584-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 input\");\n",
-              "//# sourceURL=js_2e2201ddc4"
+              "window[\"ed8ea972-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 input\");\n",
+              "//# sourceURL=js_f02070cc60"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2810\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b553d0\u003e"
             ]
           },
           "metadata": {
@@ -1260,11 +913,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3ed76585-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3ed76584-3eb4-11e8-91ec-c8d3ffb5fbe0\"].remove();\n",
-              "//# sourceURL=js_288e5283d6"
+              "window[\"ed8ea973-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ed8ea972-4362-11e8-91ec-c8d3ffb5fbe0\"].remove();\n",
+              "//# sourceURL=js_ed9faba660"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a26d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31a95450\u003e"
             ]
           },
           "metadata": {
@@ -1279,11 +932,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3ed76586-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 span\");\n",
-              "//# sourceURL=js_2f31d19cde"
+              "window[\"ed8ea974-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 span\");\n",
+              "//# sourceURL=js_f3458d7074"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2fd0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31a95250\u003e"
             ]
           },
           "metadata": {
@@ -1298,11 +951,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3ed76587-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3ed76586-3eb4-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
-              "//# sourceURL=js_2fbbcda050"
+              "window[\"ed8ea975-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ed8ea974-4362-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
+              "//# sourceURL=js_3ffd97bd6f"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4112527e90\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31a953d0\u003e"
             ]
           },
           "metadata": {
@@ -1317,11 +970,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3ed76588-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b9872-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_f94d975cf3"
+              "window[\"ed8ea976-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec96551a-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_7f73e8bcca"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2fd0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55710\u003e"
             ]
           },
           "metadata": {
@@ -1337,7 +990,7 @@
         "def predict_input_fn(color_name):\n",
         "  \"\"\"An input function for prediction.\"\"\"\n",
         "  _, chars, sequence_length = parse(color_name)\n",
-        "  \n",
+        "\n",
         "  # We create a batch of a single element.\n",
         "  features = {\n",
         "      'chars': tf.expand_dims(chars, 0),\n",
@@ -1385,7 +1038,11 @@
     "colab": {
       "collapsed_sections": [],
       "default_view": {},
-      "name": "RNN Colorbot using Estimators",
+      "last_runtime": {
+        "build_target": "",
+        "kind": "local"
+      },
+      "name": "RNN Colorbot using Keras and Estimators",
       "provenance": [
         {
           "file_id": "1CtzefX39ffFibX_BqE6cRbT0UW_DdVKl",
-- 
GitLab


From b4c37a452d2ed1d1c29ceb70127c4ef6434c44ca Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 07:13:03 -0700
Subject: [PATCH 138/450] Teach the conditional simplifier about sharding.

PiperOrigin-RevId: 193510638
---
 tensorflow/compiler/xla/service/conditional_simplifier.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc
index f35de08085..e560abc87f 100644
--- a/tensorflow/compiler/xla/service/conditional_simplifier.cc
+++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc
@@ -69,7 +69,7 @@ static StatusOr<bool> TryRemoveConditional(HloInstruction* conditional) {
         conditional->shape(), {conditional->mutable_operand(2)},
         conditional->false_computation()));
   }
-
+  conditional->SetupDerivedInstruction(call_op);
   TF_RETURN_IF_ERROR(computation->ReplaceInstruction(conditional, call_op));
   TF_RETURN_IF_ERROR(CallInliner::Inline(call_op).status());
 
-- 
GitLab


From 1a2eb108a3e513a4f4609b9d421277bc222e5eb0 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 19 Apr 2018 15:03:05 +0000
Subject: [PATCH 139/450] Update docs for tf.unstack with respect to numpy.

In 18692 an issue was raised over whether tf.unstack
is compatible with numpy.unstack (specified in current docs)
or numpy.split.

It looks like there is no numpy.unstack. And for numpy.split,
it is not compatible with tf.unstack.

The tf.split is very close to numpy.split. However, the second
arg `num_or_size_splits` in `tf.split` requires the number of
the splits, while the second arg `indices_or_sections` in
`numpy.split` requires the indices of the splits. For that reason,
tf.split is not compatible with numpy.split either.

Based on the above, this fix simply removes the `The numpy equivalent` part
from the docs of tf.unstack.

This fix fixes 18692.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/array_ops.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index ceeabe090d..23202ae28e 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1057,9 +1057,7 @@ def unstack(value, num=None, axis=0, name="unstack"):
     `value[:, i, :, :]` and each tensor in `output` will have shape `(A, C, D)`.
   Etc.
 
-  This is the opposite of stack.  The numpy equivalent is
-
-      tf.unstack(x, n) = np.unstack(x)
+  This is the opposite of stack.
 
   Args:
     value: A rank `R > 0` `Tensor` to be unstacked.
-- 
GitLab


From 7e735e5be811bacfa4e16aeae2e8aa53ef209ea6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 09:13:47 -0700
Subject: [PATCH 140/450] Pin pip to version 9.0.3

* This is because pip 10 is still unstable in some distros
* reference: https://github.com/pypa/pip/issues/5240

PiperOrigin-RevId: 193525542
---
 tensorflow/tools/ci_build/install/install_pip_packages.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index fc137aeeed..9644277fab 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -19,11 +19,11 @@ set -e
 # We don't apt-get install so that we can install a newer version of pip.
 # Only needed for Ubuntu 14.04 ,and not needed for Ubuntu 16.04 / Debian 8,9
 if $(cat /etc/*-release | grep -q 14.04); then
-  easy_install -U pip
-  easy_install3 -U pip
+  easy_install -U pip==9.0.3
+  easy_install3 -U pip==9.0.3
 else
-  pip2 install --upgrade pip
-  pip3 install --upgrade pip
+  pip2 install --upgrade pip==9.0.3
+  pip3 install --upgrade pip==9.0.3
 fi
 
 # Install pip packages from whl files to avoid the time-consuming process of
-- 
GitLab


From 51a26bb2f3e66fc79a5870f6eed88f60de995d4a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 09:23:35 -0700
Subject: [PATCH 141/450] [TF:XLA] Change HloTestBase::ExecuteNoHloPasses to
 return a literal directly.

PiperOrigin-RevId: 193526900
---
 tensorflow/compiler/xla/tests/hlo_test_base.cc | 8 +++++---
 tensorflow/compiler/xla/tests/hlo_test_base.h  | 2 +-
 tensorflow/compiler/xla/tests/tuple_test.cc    | 3 +--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index c5afe0c3e0..9984aba089 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -113,11 +113,13 @@ StatusOr<std::unique_ptr<Literal>> HloTestBase::Execute(
   return test_runner_.Execute(std::move(module), arguments);
 }
 
-StatusOr<std::unique_ptr<Literal>> HloTestBase::ExecuteNoHloPasses(
+std::unique_ptr<Literal> HloTestBase::ExecuteNoHloPasses(
     std::unique_ptr<HloModule> module,
     tensorflow::gtl::ArraySlice<Literal*> arguments) {
-  return test_runner_.Execute(std::move(module), arguments,
-                              /*run_hlo_passes=*/false);
+  return test_runner_
+      .Execute(std::move(module), arguments,
+               /*run_hlo_passes=*/false)
+      .ValueOrDie();
 }
 
 std::unique_ptr<Literal> HloTestBase::ExecuteAndTransfer(
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 28d7ab09cb..79fcea9403 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -99,7 +99,7 @@ class HloTestBase : public ::testing::Test {
 
   // Same as above, except the module will be executed without running any HLO
   // passes on it.
-  StatusOr<std::unique_ptr<Literal>> ExecuteNoHloPasses(
+  std::unique_ptr<Literal> ExecuteNoHloPasses(
       std::unique_ptr<HloModule> module,
       tensorflow::gtl::ArraySlice<Literal*> arguments);
 
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 098be6d7aa..61d0fa02ab 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -535,8 +535,7 @@ TEST_F(TupleHloTest,
       HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
           .ValueOrDie();
   auto param = Literal::MakeTupleOwned(Literal::CreateR1<float>({1, 2, 3}));
-  TF_ASSERT_OK_AND_ASSIGN(auto result,
-                          ExecuteNoHloPasses(std::move(module), {param.get()}));
+  auto result = ExecuteNoHloPasses(std::move(module), {param.get()});
   EXPECT_TRUE(LiteralTestUtil::Equal(
       *result,
       *Literal::MakeTupleOwned(Literal::CreateR2<float>({{1, 2, 3}}))));
-- 
GitLab


From 0b3950d67bcb07c11f87bd3c2da554017bff0674 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Fri, 20 Apr 2018 00:35:54 +0800
Subject: [PATCH 142/450] Fix code block rendering in several api definitions

---
 tensorflow/core/api_def/base_api/api_def_Pad.pbtxt        | 1 +
 tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt b/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
index e45e2375eb..ee4aad7899 100644
--- a/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
@@ -24,5 +24,6 @@ pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
                       [0, 0, 2, 2, 0, 0]
                       [0, 0, 0, 0, 0, 0]]
 ```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
index b9e75caf02..37ac10dddb 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
@@ -44,6 +44,7 @@ In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
 if T == qint8, out[i] -= (range(T) + 1) / 2.0
 ```
+
 here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 
 *MIN_COMBINED Mode Example*
@@ -87,6 +88,7 @@ choosing to elide the lowest possible value for symmetry (e.g., output range is
 
 We first find the range of values in our tensor. The
 range we use is always centered on 0, so we find m such that
+
 ```c++
   m = max(abs(input_min), abs(input_max))
 ```
@@ -95,6 +97,7 @@ Our input tensor range is then `[-m, m]`.
 
 Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
 If T is signed, this is
+
 ```
   num_bits = sizeof(T) * 8
   [min_fixed, max_fixed] =
@@ -102,16 +105,19 @@ If T is signed, this is
 ```
 
 Otherwise, if T is unsigned, the fixed-point range is
+
 ```
   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
 ```
 
 From this we compute our scaling factor, s:
+
 ```c++
   s = (max_fixed - min_fixed) / (2 * m)
 ```
 
 Now we can quantize the elements of our tensor:
+
 ```c++
 result = round(input * s)
 ```
-- 
GitLab


From 1f1d7b88717847f590987ee40efbe970bb591275 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 09:34:24 -0700
Subject: [PATCH 143/450] Disable dlopen error of libneuralnetworks for
 non-Android platforms.

PiperOrigin-RevId: 193528346
---
 tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 85aca36874..ace4827d8c 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -34,10 +34,13 @@ limitations under the License.
 inline void* loadLibrary(const char* name) {
   // TODO: change RTLD_LOCAL? Assumes there can be multiple instances of nn
   // api RT
-  void* handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
+  void* handle = nullptr;
+#ifdef __ANDROID__
+  handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
   if (handle == nullptr) {
     NNAPI_LOG("nnapi error: unable to open library %s", name);
   }
+#endif
   return handle;
 }
 
-- 
GitLab


From c173157bdc132460c6f424a9803221e74fc73f59 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Thu, 19 Apr 2018 09:37:20 -0700
Subject: [PATCH 144/450] [tf.data] Add checkpointing support for
 MapAndBatchDataset.

PiperOrigin-RevId: 193528712
---
 .../kernel_tests/batch_dataset_op_test.py     |  31 ++
 .../kernels/data/map_and_batch_dataset_op.cc  | 277 +++++++++++++++++-
 2 files changed, 302 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index e1ec60d7c9..a4a0ce79b6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -681,6 +681,37 @@ class UnbatchDatasetSerializationTest(
         num_outputs)
 
 
+class MapAndBatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testSerializationCore(self):
+    range_size = 11
+    num_repeats = 2
+    batch_size = 5
+    total_outputs = range_size * num_repeats
+    num_outputs_drop_remainder = total_outputs // batch_size
+    num_outputs_keep_remainder = int(math.ceil(total_outputs / batch_size))
+    num_parallel_batches = 2
+
+    def build_ds(range_start, drop_remainder=False):
+
+      def _map_fn(x):
+        return math_ops.square(x)
+
+      return dataset_ops.Dataset.range(
+          range_start, range_start + range_size).repeat(num_repeats).apply(
+              batching.map_and_batch(
+                  map_func=_map_fn,
+                  batch_size=batch_size,
+                  num_parallel_batches=num_parallel_batches,
+                  drop_remainder=drop_remainder))
+
+    self.run_core_tests(lambda: build_ds(10), lambda: build_ds(15),
+                        num_outputs_keep_remainder)
+    self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
+                        num_outputs_drop_remainder)
+
+
 class PaddedBatchDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index aaf4dc7341..b8105552a0 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -74,26 +74,29 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(
                             func_, std::move(other_arguments), &captured_func));
 
-    *output = new Dataset(input, batch_size, num_parallel_batches,
-                          drop_remainder, output_types_, output_shapes_,
+    *output = new Dataset(ctx, input, batch_size, num_parallel_batches,
+                          drop_remainder, output_types_, output_shapes_, func_,
                           std::move(captured_func), &ctx->eigen_cpu_device());
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input, int64 batch_size,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
             int64 num_parallel_batches, bool drop_remainder,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const Eigen::ThreadPoolDevice* device)
-        : input_(input),
+        : GraphDatasetBase(ctx),
+          input_(input),
           batch_size_(batch_size),
           num_parallel_batches_(num_parallel_batches),
           drop_remainder_(drop_remainder),
           output_types_(output_types),
           output_shapes_(output_shapes),
+          map_fn_(func),
           captured_func_(std::move(captured_func)),
           device_(device) {
       input_->Ref();
@@ -117,6 +120,48 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "MapAndBatchDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, map_fn_.name()));
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* batch_size_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node));
+      Node* num_parallel_batches_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(num_parallel_batches_, &num_parallel_batches_node));
+      Node* drop_remainder_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder_node));
+
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(map_fn_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {std::make_pair(0, input_graph_node),
+           std::make_pair(2, batch_size_node),
+           std::make_pair(3, num_parallel_batches_node),
+           std::make_pair(4, drop_remainder_node)},  // Single tensor inputs.
+          {std::make_pair(1, other_arguments)},      // Tensor list inputs.
+          {std::make_pair("f", f),
+           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+          output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -217,9 +262,83 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return status;
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (current_batch_index_ == -1) {
+          // Iterator has not been used. Nothing to save.
+          return Status::OK();
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_batch_index"),
+                                               current_batch_index_));
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("invocation_results_size"), invocation_results_.size()));
+        for (size_t i = 0; i < invocation_results_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteInvocationResultLocked(writer, i));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("batch_results_size"),
+                                               batch_results_.size()));
+        for (size_t i = 0; i < batch_results_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteBatchResultLocked(writer, i));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("current_batch_index"))) {
+          // Iterator was never used so nothing to restore.
+          return Status::OK();
+        }
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("current_batch_index"), &temp));
+          current_batch_index_ = static_cast<int32>(temp);
+          if (current_batch_index_ != temp) {
+            return errors::Internal("Invalid value for current_batch_index ",
+                                    temp);
+          }
+        }
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        size_t invocation_results_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("invocation_results_size"), &temp));
+          invocation_results_size = static_cast<size_t>(temp);
+          if (invocation_results_size != temp) {
+            return errors::Internal(
+                "Invalid value for invocation_results_size ", temp);
+          }
+        }
+        CHECK_EQ(invocation_results_.size(), invocation_results_size);
+        for (size_t i = 0; i < invocation_results_size; ++i) {
+          TF_RETURN_IF_ERROR(ReadInvocationResultLocked(reader, i));
+        }
+        size_t batch_results_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("batch_results_size"), &temp));
+          batch_results_size = static_cast<size_t>(temp);
+          if (batch_results_size != temp) {
+            return errors::Internal("Invalid value for batch_results_size ",
+                                    temp);
+          }
+        }
+        CHECK_EQ(batch_results_.size(), batch_results_size);
+        for (size_t i = 0; i < batch_results_size; ++i) {
+          TF_RETURN_IF_ERROR(ReadBatchResultLocked(reader, i));
+        }
+        return Status::OK();
+      }
+
      private:
       struct BatchResult {
-        mutex mu;
+        mutex mu ACQUIRED_AFTER(mu_);
         bool output_allocated GUARDED_BY(mu);
         std::vector<Tensor> output;
         std::unique_ptr<BlockingCounter> counter;
@@ -393,6 +512,151 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return status;
       }
 
+      Status WriteInvocationResultLocked(IteratorStateWriter* writer,
+                                         size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        const InvocationResult& result = invocation_results_[index];
+        string prefix = strings::StrCat("invocation_results_", index);
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, full_name(strings::StrCat(prefix, "_status")),
+            result.status));
+        if (result.end_of_input) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_end_of_input")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_return_values_size")),
+            result.return_values.size()));
+        for (size_t i = 0; i < result.return_values.size(); i++) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_return_values_", i)),
+              result.return_values[i]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadInvocationResultLocked(IteratorStateReader* reader,
+                                        size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        InvocationResult* result = &invocation_results_[index];
+        string prefix = strings::StrCat("invocation_results_", index);
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, full_name(strings::StrCat(prefix, "_status")),
+            &result->status));
+        result->end_of_input = reader->Contains(
+            full_name(strings::StrCat(prefix, "_end_of_input")));
+        size_t return_values_size;
+        {
+          int64 temp;  // The size is read back as a signed 64-bit scalar.
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_return_values_size")),
+              &temp));
+          return_values_size = static_cast<size_t>(temp);
+          if (temp < 0 || temp != return_values_size) {  // Negative or lossy.
+            return errors::Internal("Invalid value for return_values_size ",
+                                    temp);  // Report the value as read.
+          }
+        }
+        result->return_values.reserve(return_values_size);
+        for (size_t i = 0; i < return_values_size; i++) {
+          result->return_values.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(prefix, "_return_values_", i)),
+              &result->return_values.back()));
+        }
+        return Status::OK();
+      }
+
+      Status WriteBatchResultLocked(IteratorStateWriter* writer, size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        // Wait for the map_fn dispatches made in `InvokeFunctionLocked` to
+        // finish. This may delay saving a checkpoint by a bit but keeps the
+        // code clean and also saves us from checkpointing the state of the
+        // `BlockingCounter`.
+        batch_results_[index].counter->Wait();
+        const BatchResult& result = batch_results_[index];
+        string prefix = strings::StrCat("batch_results_", index);
+        {
+          mutex_lock l(batch_results_[index].mu);
+          if (result.output_allocated) {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat(prefix, "_output_allocated")), ""));
+          }
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_output_size")),
+            result.output.size()));
+        for (size_t i = 0; i < result.output.size(); i++) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_output_", i)),
+              result.output[i]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadBatchResultLocked(IteratorStateReader* reader, size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        BatchResult* result = &batch_results_[index];
+        string prefix = strings::StrCat("batch_results_", index);
+        {
+          mutex_lock l(batch_results_[index].mu);
+          result->output_allocated = reader->Contains(
+              full_name(strings::StrCat(prefix, "_output_allocated")));
+          // Simulate that the batch was fully generated.
+          batch_results_[index].counter.reset(new BlockingCounter(0));
+        }
+        size_t output_size;
+        {
+          int64 temp;  // The size is read back as a signed 64-bit scalar.
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_output_size")), &temp));
+          output_size = static_cast<size_t>(temp);
+          if (temp < 0 || temp != output_size) {  // Negative or lossy cast.
+            return errors::Internal("Invalid value for output_size ",
+                                    temp);  // Report the value as read.
+          }
+        }
+        result->output.reserve(output_size);
+        for (size_t i = 0; i < output_size; i++) {
+          result->output.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(prefix, "_output_", i)),
+              &result->output.back()));
+        }
+        return Status::OK();
+      }
+
+      Status WriteStatusLocked(IteratorStateWriter* writer,
+                               const string& prefix, const Status& status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
+                                static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
+                                  status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
+                              Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_code")), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_msg")), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
       mutex mu_;
       int32 current_batch_index_ GUARDED_BY(mu_) = -1;
       const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
@@ -407,6 +671,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     const bool drop_remainder_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
+    const NameAttrList map_fn_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const Eigen::ThreadPoolDevice* device_;  // not owned
   };
-- 
GitLab


From 436f1434060d7f370baae9661baacc6cf27415ec Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Thu, 19 Apr 2018 09:54:40 -0700
Subject: [PATCH 145/450] Create a skeleton tf.contrib.checkpoint.

My plan for this is to incubate tools for working with object-based checkpoints:
  - Tools for managing dependency graphs, e.g. checkpointable lists/dictionaries
  - Inspecting/visualizing checkpoints
  - Listing variables and gathering initializers from a Checkpointable object
    and its dependencies
  - Verifying all variables are accessible as dependencies, which should make
    converting existing graph building Saver uses easier/safer.

This CL includes none of those things; it just moves the split_dependency tool
here from contrib/eager.

PiperOrigin-RevId: 193531292
---
 tensorflow/contrib/__init__.py                |  1 +
 tensorflow/contrib/checkpoint/README.md       |  2 +
 tensorflow/contrib/checkpoint/__init__.py     | 29 +++++++++++
 tensorflow/contrib/checkpoint/python/BUILD    | 29 +++++++++++
 .../python/split_dependency.py}               |  8 ++--
 .../python/split_dependency_test.py}          |  4 +-
 tensorflow/contrib/cmake/python_modules.txt   |  2 +
 tensorflow/contrib/cudnn_rnn/BUILD            |  2 +-
 .../cudnn_rnn/python/ops/cudnn_rnn_ops.py     |  4 +-
 tensorflow/contrib/eager/python/BUILD         | 48 ++-----------------
 tensorflow/contrib/optimizer_v2/BUILD         |  1 -
 tensorflow/tools/pip_package/BUILD            |  1 -
 12 files changed, 75 insertions(+), 56 deletions(-)
 create mode 100644 tensorflow/contrib/checkpoint/README.md
 create mode 100644 tensorflow/contrib/checkpoint/__init__.py
 create mode 100644 tensorflow/contrib/checkpoint/python/BUILD
 rename tensorflow/contrib/{eager/python/checkpointable_utils.py => checkpoint/python/split_dependency.py} (95%)
 rename tensorflow/contrib/{eager/python/checkpointable_utils_test.py => checkpoint/python/split_dependency_test.py} (96%)

diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 36cc5144d0..0d163daa6e 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -24,6 +24,7 @@ import os
 # Add projects here, they will show up under tf.contrib.
 from tensorflow.contrib import batching
 from tensorflow.contrib import bayesflow
+from tensorflow.contrib import checkpoint
 from tensorflow.contrib import cloud
 from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib import coder
diff --git a/tensorflow/contrib/checkpoint/README.md b/tensorflow/contrib/checkpoint/README.md
new file mode 100644
index 0000000000..d35c5bae3b
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/README.md
@@ -0,0 +1,2 @@
+Tools for working with object-based checkpoints produced by
+`tf.train.Checkpoint`.
diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py
new file mode 100644
index 0000000000..70d7d2d8d7
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tools for working with object-based checkpoints.
+
+
+For creating and managing dependencies:
+@@split_dependency
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency
+from tensorflow.python.util.all_util import remove_undocumented
+
+remove_undocumented(module_name=__name__)
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
new file mode 100644
index 0000000000..d57b01aab2
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -0,0 +1,29 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "split_dependency",
+    srcs = ["split_dependency.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_test(
+    name = "split_dependency_test",
+    srcs = ["split_dependency_test.py"],
+    deps = [
+        ":split_dependency",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:test",
+    ],
+)
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils.py b/tensorflow/contrib/checkpoint/python/split_dependency.py
similarity index 95%
rename from tensorflow/contrib/eager/python/checkpointable_utils.py
rename to tensorflow/contrib/checkpoint/python/split_dependency.py
index 30c4103c5a..3aec8c96e9 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency.py
@@ -1,4 +1,4 @@
-"""Utilities for working with Checkpointable objects."""
+"""Utility for creating multiple dependencies with synchronized save/restore."""
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,7 +20,7 @@ from __future__ import print_function
 import functools
 
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.training import checkpointable as core_checkpointable
+from tensorflow.python.training import checkpointable as checkpointable
 from tensorflow.python.training import saver as saver_lib
 
 
@@ -43,7 +43,7 @@ class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
     return self._restore_callback(tensor)
 
 
-class _SplitDependency(core_checkpointable.CheckpointableBase):
+class _SplitDependency(checkpointable.CheckpointableBase):
   """Looks like a regular variable while synchronizing save/restores."""
 
   def __init__(self, save_buffer, restore_buffer, name, dtype, num_components,
@@ -83,7 +83,7 @@ class _SplitDependency(core_checkpointable.CheckpointableBase):
   def _gather_saveables_for_checkpoint(self):
     """Looks to Checkpointable like a regular variable."""
     return {
-        core_checkpointable.VARIABLE_VALUE_KEY:
+        checkpointable.VARIABLE_VALUE_KEY:
         functools.partial(_CallbackSaveable,
                           dtype=self._dtype,
                           save_callback=self._save,
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
similarity index 96%
rename from tensorflow/contrib/eager/python/checkpointable_utils_test.py
rename to tensorflow/contrib/checkpoint/python/split_dependency_test.py
index da04199aaa..cb964c80e9 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
@@ -18,7 +18,7 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.eager.python import checkpointable_utils as contrib_checkpointable_utils
+from tensorflow.contrib.checkpoint.python import split_dependency
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -47,7 +47,7 @@ class SaveTensorSlicesAsDeps(checkpointable.CheckpointableBase):
 
   def __init__(self):
     self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
-    split_dependencies = contrib_checkpointable_utils.split_dependency(
+    split_dependencies = split_dependency.split_dependency(
         component_names=("first_half", "second_half"),
         component_dtypes=(self.combined.dtype,) * 2,
         fill_save_buffer_fn=_split_variable_closure(
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 91839194c7..fbcdf7e753 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -130,6 +130,8 @@ tensorflow/contrib/boosted_trees/ops
 tensorflow/contrib/boosted_trees/proto
 tensorflow/contrib/boosted_trees/python
 tensorflow/contrib/boosted_trees/python/ops
+tensorflow/contrib/checkpoint
+tensorflow/contrib/checkpoint/python
 tensorflow/contrib/cloud
 tensorflow/contrib/cloud/kernels
 tensorflow/contrib/cloud/ops
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index d68015ae15..aeefa3cee6 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -25,7 +25,7 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/eager/python:checkpointable_utils",
+        "//tensorflow/contrib/checkpoint/python:split_dependency",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index b615824460..a1ede4471e 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.eager.python import checkpointable_utils
+from tensorflow.contrib.checkpoint.python import split_dependency
 from tensorflow.contrib.rnn.python.ops import lstm_ops
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
@@ -318,7 +318,7 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
         dependencies too (typically the cuDNN `Layer`).
       dtype: The dtype for the canonical parameter Tensors.
     """
-    split_dependencies = checkpointable_utils.split_dependency(
+    split_dependencies = split_dependency.split_dependency(
         component_names=self._param_names,
         component_dtypes=(dtype,) * len(self._param_names),
         fill_save_buffer_fn=self._checkpointable_save,
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index e2744a430d..99abbae03f 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -11,7 +11,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        ":checkpointable_utils",
         ":datasets",
         ":metrics",
         ":network",
@@ -19,15 +18,14 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:gradients",
-        "//tensorflow/python:numerics",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:script_ops",
         "//tensorflow/python:template",
+        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:core",
         "//tensorflow/python/eager:execution_callbacks",
         "//tensorflow/python/eager:function",
     ],
@@ -70,7 +68,6 @@ cuda_py_test(
     srcs = ["datasets_test.py"],
     additional_deps = [
         ":datasets",
-        ":checkpointable_utils",
         "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/contrib/data/python/ops:threadpool",
         "//tensorflow/contrib/data/python/ops:unique",
@@ -79,6 +76,7 @@ cuda_py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:training",
         "//tensorflow/python/data",
         "//tensorflow/python/eager:test",
     ],
@@ -121,8 +119,8 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/eager/python:checkpointable_utils",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:checkpointable",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
@@ -225,43 +223,3 @@ py_test(
         "//tensorflow/python/eager:test",
     ],
 )
-
-py_library(
-    name = "checkpointable_utils",
-    srcs = ["checkpointable_utils.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:training",
-    ],
-)
-
-cuda_py_test(
-    name = "checkpointable_utils_test",
-    srcs = ["checkpointable_utils_test.py"],
-    additional_deps = [
-        ":checkpointable_utils",
-        ":network",
-        "@six_archive//:six",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:layers_base",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:test",
-        "//tensorflow/python/keras",
-    ],
-    tags = [
-        "no_windows",  # TODO: needs investigation on Windows
-        "notsan",  # b/74395663
-    ],
-)
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 85cfce346c..5225ecc14f 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -115,7 +115,6 @@ cuda_py_test(
     additional_deps = [
         ":training",
         "@six_archive//:six",
-        "//tensorflow/contrib/eager/python:checkpointable_utils",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 2ef105755f..0ac5a5bb6d 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -66,7 +66,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
     "//tensorflow/contrib/data/python/ops:contrib_op_loader",
     "//tensorflow/contrib/eager/python/examples:examples_pip",
-    "//tensorflow/contrib/eager/python:checkpointable_utils",
     "//tensorflow/contrib/eager/python:evaluator",
     "//tensorflow/contrib/gan:gan",
     "//tensorflow/contrib/graph_editor:graph_editor_pip",
-- 
GitLab


From 2273b62a769aa477f8d2ef02ca7dee253b8ea7b0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 19 Apr 2018 10:05:08 -0700
Subject: [PATCH 146/450] Added support for concatenation and slicing of
 symbolic shapes

PiperOrigin-RevId: 193532769
---
 ...direct_session_with_tracking_alloc_test.cc |   4 +-
 tensorflow/core/framework/shape_inference.cc  |   2 +
 tensorflow/core/framework/shape_inference.h   |  12 +
 .../core/grappler/costs/graph_properties.cc   | 236 ++++++++++++++++--
 4 files changed, 235 insertions(+), 19 deletions(-)

diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 31fb128f93..b4dd521bbc 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -102,9 +102,9 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
         EXPECT_EQ(2, shape.dim(0).size());
         EXPECT_EQ(1, shape.dim(1).size());
         if (node->name() == y->name()) {
-          EXPECT_EQ(3, cm->AllocationId(node, 0));
+          EXPECT_EQ(7, cm->AllocationId(node, 0));
         } else {
-          EXPECT_EQ(4, cm->AllocationId(node, 0));
+          EXPECT_EQ(8, cm->AllocationId(node, 0));
         }
       }
       EXPECT_LE(0, cm->MaxExecutionTime(node));
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 229b4a45fa..2b995e8b5e 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -157,8 +157,10 @@ InferenceContext::~InferenceContext() {}
 
 Status InferenceContext::Run(
     const std::function<Status(shape_inference::InferenceContext* c)>& fn) {
+  ForgetMerges();
   Status s = fn(this);
   if (!s.ok()) {
+    ForgetMerges();
     return AttachContext(s);
   }
 #ifndef NDEBUG
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index cdb4bd79bb..9431a62abe 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -285,6 +285,8 @@ class InferenceContext {
     return true;
   }
 
+  void SetInput(int idx, ShapeHandle shape) { inputs_[idx] = shape; }
+
   ShapeHandle input(int64 idx) const { return inputs_[idx]; }
   Status input(StringPiece input_name, std::vector<ShapeHandle>* output) const;
   int num_inputs() const { return inputs_.size(); }
@@ -317,6 +319,10 @@ class InferenceContext {
     input_tensors_as_shapes_ = input_tensors_as_shapes;
   }
 
+  const std::vector<ShapeHandle>& input_tensors_as_shapes() const {
+    return input_tensors_as_shapes_;
+  }
+
   ShapeHandle output(int64 idx) const { return outputs_[idx]; }
   void set_output(int idx, ShapeHandle shape) { outputs_[idx] = shape; }
   Status set_output(StringPiece output_name,
@@ -587,6 +593,12 @@ class InferenceContext {
       int idx,
       const std::vector<ShapeAndType>& shapes_and_types) TF_MUST_USE_RESULT;
 
+  void set_input_handle_shapes_and_types(
+      int idx, const std::vector<ShapeAndType>& shapes_and_types) {
+    input_handle_shapes_and_types_[idx].reset(
+        new std::vector<ShapeAndType>(shapes_and_types));
+  }
+
   // Returns the output handle shapes and types, for the resource tensor output
   // at index <idx>. Returns NULL if the shape and types were never set.
   const std::vector<ShapeAndType>* output_handle_shapes_and_types(int idx) {
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index a9c777e551..c83ddfe90a 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -18,8 +18,9 @@ limitations under the License.
 #include <queue>
 #include <unordered_map>
 #include <unordered_set>
-#include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/utils.h"
@@ -394,15 +395,121 @@ class TopoQueue {
 // unknown shape/dimension of a given node.
 class SymbolicShapeRefiner {
  public:
-  explicit SymbolicShapeRefiner(ShapeRefiner* shape_refiner)
-      : shape_refiner_(shape_refiner) {}
+  explicit SymbolicShapeRefiner(const GraphDef& graph)
+      : function_library_(OpRegistry::Global(), graph.library()) {
+    graph_def_version_ = graph.versions().producer();
+    node_to_context_.reserve(graph.node_size());
+  }
 
   InferenceContext* GetContext(const Node* node) {
-    return shape_refiner_->GetContext(node);
+    auto it = node_to_context_.find(node);
+    if (it == node_to_context_.end()) {
+      return nullptr;
+    }
+    return it->second.inference_context.get();
   }
   Status UpdateNode(const Node* node, bool relax, bool* refined) {
-    return shape_refiner_->UpdateNode(node, relax, refined);
+    NodeContext* node_context = GetNodeContext(node);
+    if (node_context == nullptr) {
+      TF_RETURN_IF_ERROR(AddNode(node));
+      node_context = CHECK_NOTNULL(GetNodeContext(node));
+      *refined = true;
+    }
+    // Check if the shapes of the nodes in the fan-in of this node have changed,
+    // and if they have, update the node input shapes.
+    InferenceContext* inference_context = node_context->inference_context.get();
+    std::vector<Tensor> const_values(node->num_inputs());
+    std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
+    std::vector<ShapeHandle> input_tensors_as_shapes(node->num_inputs());
+
+    for (const Edge* e : node->in_edges()) {
+      if (e->IsControlEdge()) continue;
+
+      int dst_input = e->dst_input();
+      int src_output = e->src_output();
+
+      Node* input = e->src();
+      NodeContext* c = GetNodeContext(input);
+      if (c == nullptr) {
+        return errors::FailedPrecondition(
+            "Input ", dst_input, " ('", input->name(), "') for '", node->name(),
+            "' was not previously added to ShapeRefiner.");
+      }
+
+      if (input->IsConstant()) {
+        // Convert constant value into tensors.
+        if (const_values[dst_input].FromProto(
+                input->def().attr().at("value").tensor())) {
+          input_tensors[dst_input] = &const_values[dst_input];
+          // Integer tensors of rank one can also be interpreted as a shape
+          // provided all their values are >= -1.
+          if (const_values[dst_input].dims() == 1 &&
+              (const_values[dst_input].dtype() == DT_INT32 ||
+               const_values[dst_input].dtype() == DT_INT64)) {
+            ShapeHandle tensor_shape = inference_context->Vector(
+                const_values[dst_input].NumElements());
+            ShapeHandle shp;
+            if (inference_context
+                    ->MakeShapeFromTensor(input_tensors[dst_input],
+                                          tensor_shape, &shp)
+                    .ok()) {
+              input_tensors_as_shapes[dst_input] = shp;
+            }
+          }
+        }
+      }
+
+      if (c->output_tensors_as_shapes.size() > src_output) {
+        input_tensors_as_shapes[dst_input] =
+            c->output_tensors_as_shapes[src_output];
+      }
+
+      DCHECK_GE(dst_input, 0);
+      if (!*refined && !inference_context->input(dst_input).SameHandle(
+                           c->inference_context->output(src_output))) {
+        *refined = true;
+      }
+      inference_context->SetInput(dst_input,
+                                  c->inference_context->output(src_output));
+
+      if (!*refined &&
+          inference_context->requested_input_tensor_as_partial_shape(
+              dst_input)) {
+        // The input value may have changed. Since we have no way to know if
+        // that's indeed the case, err on the safe side.
+        *refined = true;
+      }
+
+      // Also propagate handle shape and dtype of edges which are carrying
+      // resource handles.
+      if (e->src()->output_type(src_output) == DT_RESOURCE) {
+        auto* outputs =
+            c->inference_context->output_handle_shapes_and_types(src_output);
+        if (!outputs) continue;
+        auto* inputs =
+            inference_context->input_handle_shapes_and_types(dst_input);
+
+        if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs)) {
+          *refined = true;
+        }
+        inference_context->set_input_handle_shapes_and_types(dst_input,
+                                                             *outputs);
+      }
+    }
+
+    if (!*refined) {
+      // No input shape has changed, we're done
+      return Status::OK();
+    }
+
+    node_context->inference_context->set_input_tensors(input_tensors);
+    node_context->inference_context->set_input_tensors_as_shapes(
+        input_tensors_as_shapes);
+
+    // Update the shapes of the outputs.
+    return InferShapes(node, node_context);
   }
+
   Status SetUnknownShape(const Node* node, int output_port) {
     shape_inference::ShapeHandle shape =
         GetUnknownOutputShape(node, output_port);
@@ -450,7 +557,7 @@ class SymbolicShapeRefiner {
     if (shape1.SameHandle(shape2)) {
       return shape1;
     }
-    InferenceContext* ctx = shape_refiner_->GetContext(node);
+    InferenceContext* ctx = GetContext(node);
     ShapeHandle merged = shape1;
     if (!ctx->RankKnown(shape2) && !ctx->RankKnown(shape1)) {
       // Return either one since they're expected to represent the same value.
@@ -495,7 +602,7 @@ class SymbolicShapeRefiner {
     if (shape1.SameHandle(shape2)) {
       return shape1;
     }
-    InferenceContext* ctx = shape_refiner_->GetContext(node);
+    InferenceContext* ctx = GetContext(node);
     ShapeHandle relaxed = shape1;
     const int rank = ctx->Rank(shape1);
     if (!ctx->RankKnown(shape2) || ctx->Rank(shape2) != rank) {
@@ -569,7 +676,7 @@ class SymbolicShapeRefiner {
     if (it != unknown_shapes_.end()) {
       return it->second;
     }
-    InferenceContext* c = shape_refiner_->GetContext(node);
+    InferenceContext* c = GetContext(node);
     ShapeHandle shp = c->UnknownShape();
     unknown_shapes_[id] = shp;
     return shp;
@@ -582,16 +689,114 @@ class SymbolicShapeRefiner {
     if (it != unknown_dims_.end()) {
       return it->second;
     }
-    InferenceContext* c = shape_refiner_->GetContext(node);
+    InferenceContext* c = GetContext(node);
     DimensionHandle dim = c->UnknownDim();
     unknown_dims_[id] = dim;
     return dim;
   }
 
-  ShapeRefiner* shape_refiner_;
+  Status AddNode(const Node* node) {
+    // Create the inference context for this node.
+    std::vector<ShapeHandle> input_shapes(node->num_inputs());
+    std::vector<std::unique_ptr<std::vector<ShapeAndType>>>
+        input_handle_shapes_and_types(node->num_inputs());
+    std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
+    std::vector<ShapeHandle> input_tensors_as_shapes;
+
+    NodeContext& node_ctx = node_to_context_[node];
+    node_ctx.inference_context.reset(new InferenceContext(
+        graph_def_version_, &node->def(), node->op_def(), input_shapes,
+        input_tensors, input_tensors_as_shapes,
+        std::move(input_handle_shapes_and_types)));
+    const Status s = node_ctx.inference_context->construction_status();
+    if (!s.ok()) {
+      node_ctx.inference_context.reset(nullptr);
+    }
+    return s;
+  }
+
+  struct NodeContext {
+    std::unique_ptr<InferenceContext> inference_context;
+    std::vector<ShapeHandle> output_tensors_as_shapes;
+  };
+
+  // Updates `c` with the shape-valued outputs of `node` (for Shape, ShapeN,
+  // ConcatV2 and Slice) and then runs the op's registered shape function.
+  Status InferShapes(const Node* node, NodeContext* c) {
+    InferenceContext* ic = c->inference_context.get();
+
+    // Propagate shape tensors
+    if (node->type_string() == "Shape") {
+      c->output_tensors_as_shapes.assign(1, ic->input(0));
+    } else if (node->type_string() == "ShapeN") {
+      c->output_tensors_as_shapes.resize(c->inference_context->num_inputs());
+      for (int i = 0; i < c->inference_context->num_inputs(); ++i) {
+        c->output_tensors_as_shapes[i] = c->inference_context->input(i);
+      }
+    } else if (node->type_string() == "ConcatV2") {
+      bool valid = true;
+      ShapeHandle result;
+      for (int i = 0; i < ic->num_inputs() - 1; ++i) {
+        ShapeHandle input = ic->input_tensors_as_shapes()[i];
+        if (!ic->RankKnown(input)) {
+          valid = false;
+          break;
+        } else if (i == 0) {
+          result = input;
+        } else {
+          TF_RETURN_IF_ERROR(ic->Concatenate(result, input, &result));
+        }
+      }
+      if (valid) {
+        c->output_tensors_as_shapes.assign(1, result);
+      }
+    } else if (node->type_string() == "Slice") {
+      ShapeHandle input = ic->input_tensors_as_shapes()[0];
+      bool valid = ic->RankKnown(input);
+      const Tensor* slice_offset = ic->input_tensor(1);
+      valid &= slice_offset != nullptr && slice_offset->NumElements() == 1;
+      const Tensor* slice_size = ic->input_tensor(2);
+      valid &= slice_size != nullptr && slice_size->NumElements() == 1;
+      if (valid) {
+        int64 start = slice_offset->dtype() == DT_INT32
+                          ? slice_offset->flat<int32>()(0)
+                          : slice_offset->flat<int64>()(0);
+        int64 size = slice_size->dtype() == DT_INT32
+                         ? slice_size->flat<int32>()(0)
+                         : slice_size->flat<int64>()(0);
+        // A size of -1 selects everything from `start` to the last element.
+        int64 end = size == -1 ? ic->Rank(input) : start + size;
+        ShapeHandle result;
+        TF_RETURN_IF_ERROR(ic->Subshape(input, start, end, &result));
+        c->output_tensors_as_shapes.assign(1, result);
+      }
+    }
+
+    // Infer the shapes of output tensors.
+    const OpRegistrationData* op_reg_data;
+    Status s = function_library_.default_registry()->LookUp(node->type_string(),
+                                                            &op_reg_data);
+    if (!s.ok() || op_reg_data->shape_inference_fn == nullptr) {
+      // Nothing more can be inferred: annotate outputs with unknown shapes.
+      return c->inference_context->Run(shape_inference::UnknownShape);
+    }
+
+    return c->inference_context->Run(op_reg_data->shape_inference_fn);
+  }
+
+  NodeContext* GetNodeContext(const Node* node) {
+    auto it = node_to_context_.find(node);
+    if (it == node_to_context_.end()) {
+      return nullptr;
+    }
+    return &it->second;
+  }
 
+  int graph_def_version_;
+  std::unordered_map<const Node*, NodeContext> node_to_context_;
   std::unordered_map<ShapeId, ShapeHandle, HashShapeId> unknown_shapes_;
   std::unordered_map<DimId, DimensionHandle, HashDimId> unknown_dims_;
+  FunctionLibraryDefinition function_library_;
 };
 
 // Keep track of shapes and dimensions in a graph.
@@ -977,9 +1182,6 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
                                              item_.graph.library());
   Graph graph(function_library);
   graph_ = &graph;
-  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
-  shape_refiner.set_require_shape_inference_fns(false);
-  shape_refiner.set_disable_constant_propagation(true);
   ImportGraphDefOptions options;
   // Graph optimization happens at the late stage of graph execution,
   // when colocation constraints are already validated previously and
@@ -987,7 +1189,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   // is no need to validate colocation constraints again.
   options.validate_colocation_constraints = false;
   options.validate_shape = false;
-  Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner);
+  Status s = ImportGraphDef(options, item_.graph, &graph, nullptr);
   TF_RETURN_IF_ERROR(s);
 
   std::unordered_map<string, std::unordered_set<int>> fed_ports;
@@ -1041,7 +1243,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
   }
 
-  SymbolicShapeRefiner refiner(&shape_refiner);
+  SymbolicShapeRefiner refiner(item_.graph);
 
   // We propagate shapes through the graph in two phases. In the first phase, we
   // exclusively merge shapes but we do not propagate shapes through the
@@ -1073,7 +1275,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   SymbolicShapeManager shape_manager;
   bool found_error = false;
   for (const Node* const node : graph.nodes()) {
-    auto node_ctx = shape_refiner.GetContext(node);
+    auto node_ctx = refiner.GetContext(node);
     if (!node_ctx) {
       continue;
     }
@@ -1105,7 +1307,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
 
   for (const Node* const node : graph.nodes()) {
     VLOG(3) << "Filling in graph properties for node: " << node->name();
-    auto ctx = shape_refiner.GetContext(node);
+    auto ctx = refiner.GetContext(node);
     if (!ctx) {
       continue;
     }
-- 
GitLab


From bdcca449fc22cf1d8a1d6a2c01c3b67706d6023b Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 19 Apr 2018 10:14:09 -0700
Subject: [PATCH 147/450] Prototype for tf.data writer API.

PiperOrigin-RevId: 193534333
---
 .../contrib/data/python/kernel_tests/BUILD    |  20 +++
 .../python/kernel_tests/writer_ops_test.py    | 117 ++++++++++++++++++
 tensorflow/contrib/data/python/ops/BUILD      |  13 ++
 tensorflow/contrib/data/python/ops/writers.py |  58 +++++++++
 .../base_api/api_def_DatasetToTFRecord.pbtxt  |  24 ++++
 tensorflow/core/framework/dataset.h           |   4 +-
 tensorflow/core/kernels/data/BUILD            |  14 +++
 tensorflow/core/kernels/data/writer_ops.cc    | 113 +++++++++++++++++
 tensorflow/core/ops/dataset_ops.cc            |   6 +
 9 files changed, 367 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
 create mode 100644 tensorflow/contrib/data/python/ops/writers.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
 create mode 100644 tensorflow/core/kernels/data/writer_ops.cc

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index c554607960..83daa04efc 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -516,3 +516,23 @@ tf_py_test(
         "//third_party/py/numpy",
     ],
 )
+
+tf_py_test(
+    name = "writer_ops_test",
+    size = "small",
+    srcs = ["writer_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/contrib/data/python/ops:writers",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
diff --git a/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
new file mode 100644
index 0000000000..c603ecc5ab
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
@@ -0,0 +1,117 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.data.python.ops import writers
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.lib.io import python_io
+from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class TFRecordWriterTest(test.TestCase):
+
+  def setUp(self):
+    super(TFRecordWriterTest, self).setUp()
+    self._num_records = 7
+    self.filename = array_ops.placeholder(dtypes.string, shape=[])
+    self.compression_type = array_ops.placeholder_with_default("", shape=[])
+
+    input_dataset = readers.TFRecordDataset([self.filename],
+                                            self.compression_type)
+    self.writer = writers.TFRecordWriter(
+        self._outputFilename(), self.compression_type).write(input_dataset)
+
+  def _record(self, i):
+    return compat.as_bytes("Record %d" % (i))
+
+  def _createFile(self, options=None):
+    filename = self._inputFilename()
+    writer = python_io.TFRecordWriter(filename, options)
+    for i in range(self._num_records):
+      writer.write(self._record(i))
+    writer.close()
+    return filename
+
+  def _inputFilename(self):
+    return os.path.join(self.get_temp_dir(), "tf_record.in.txt")
+
+  def _outputFilename(self):
+    return os.path.join(self.get_temp_dir(), "tf_record.out.txt")
+
+  def testWrite(self):
+    with self.test_session() as sess:
+      sess.run(
+          self.writer, feed_dict={
+              self.filename: self._createFile(),
+          })
+    for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())):
+      self.assertAllEqual(self._record(i), r)
+
+  def testWriteZLIB(self):
+    options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
+    with self.test_session() as sess:
+      sess.run(
+          self.writer,
+          feed_dict={
+              self.filename: self._createFile(options),
+              self.compression_type: "ZLIB",
+          })
+    for i, r in enumerate(
+        tf_record.tf_record_iterator(self._outputFilename(), options=options)):
+      self.assertAllEqual(self._record(i), r)
+
+  def testWriteGZIP(self):
+    options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP)
+    with self.test_session() as sess:
+      sess.run(
+          self.writer,
+          feed_dict={
+              self.filename: self._createFile(options),
+              self.compression_type: "GZIP",
+          })
+    for i, r in enumerate(
+        tf_record.tf_record_iterator(self._outputFilename(), options=options)):
+      self.assertAllEqual(self._record(i), r)
+
+  def testFailDataset(self):
+    with self.assertRaises(TypeError):
+      writers.TFRecordWriter(self._outputFilename(),
+                             self.compression_type).write("whoops")
+
+  def testFailDType(self):
+    input_dataset = dataset_ops.Dataset.from_tensors(10)
+    with self.assertRaises(TypeError):
+      writers.TFRecordWriter(self._outputFilename(),
+                             self.compression_type).write(input_dataset)
+
+  def testFailShape(self):
+    input_dataset = dataset_ops.Dataset.from_tensors([["hello"], ["world"]])
+    with self.assertRaises(TypeError):
+      writers.TFRecordWriter(self._outputFilename(),
+                             self.compression_type).write(input_dataset)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index e00f2304cc..5b04c5316c 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -280,6 +280,18 @@ py_library(
     ],
 )
 
+py_library(
+    name = "writers",
+    srcs = [
+        "writers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 tf_gen_op_wrapper_py(
     name = "gen_dataset_ops",
     out = "gen_dataset_ops.py",
@@ -342,6 +354,7 @@ py_library(
         ":stats_ops",
         ":threadpool",
         ":unique",
+        ":writers",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
diff --git a/tensorflow/contrib/data/python/ops/writers.py b/tensorflow/contrib/data/python/ops/writers.py
new file mode 100644
index 0000000000..f53bd3f738
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/writers.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for tf.data writers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import convert
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_dataset_ops
+
+
+class TFRecordWriter(object):
+  """Writes data to a TFRecord file."""
+
+  def __init__(self, filename, compression_type=None):
+    self._filename = ops.convert_to_tensor(
+        filename, dtypes.string, name="filename")
+    self._compression_type = convert.optional_param_to_tensor(
+        "compression_type",
+        compression_type,
+        argument_default="",
+        argument_dtype=dtypes.string)
+
+  def write(self, dataset):
+    """Returns a @{tf.Operation} to write a dataset to a file.
+
+    Args:
+      dataset: a @{tf.data.Dataset} whose elements are to be written to a file
+
+    Returns:
+      A @{tf.Operation} that, when run, writes contents of `dataset` to a file.
+    """
+    if not isinstance(dataset, dataset_ops.Dataset):
+      raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
+    if (dataset.output_types != dtypes.string or
+        dataset.output_shapes != tensor_shape.scalar()):
+      raise TypeError(
+          "`dataset` must produce scalar `DT_STRING` tensors whereas it "
+          "produces shape {0} and types {1}".format(dataset.output_shapes,
+                                                    dataset.output_types))
+    return gen_dataset_ops.dataset_to_tf_record(
+        dataset._as_variant_tensor(), self._filename, self._compression_type)  # pylint: disable=protected-access
diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt b/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
new file mode 100644
index 0000000000..e1b8a9abdd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "DatasetToTFRecord"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the dataset to write.
+END
+  }
+  in_arg {
+    name: "filename"
+    description: <<END
+A scalar string tensor representing the filename to use.
+END
+  }
+  in_arg {
+    name: "compression_type"
+    description: <<END
+A scalar string tensor containing either (i) the empty string (no
+compression), (ii) "ZLIB", or (iii) "GZIP".
+END
+  }
+  summary: "Writes the given dataset to the given file using the TFRecord format."
+}
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 9e7ffe6c0b..8d127baac4 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -364,7 +364,7 @@ class IteratorBase {
  protected:
   // This is needed so that sub-classes of IteratorBase can call
   // `SaveInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
+  // `RepeatDatasetOp::Dataset`.
   Status SaveParent(IteratorStateWriter* writer,
                     const std::unique_ptr<IteratorBase>& parent) {
     return parent->SaveInternal(writer);
@@ -372,7 +372,7 @@ class IteratorBase {
 
   // This is needed so that sub-classes of IteratorBase can call
   // `RestoreInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
+  // `RepeatDatasetOp::Dataset`.
   Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader,
                        const std::unique_ptr<IteratorBase>& parent) {
     return parent->RestoreInternal(ctx, reader);
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 1e96eb6421..667a6967a8 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -576,6 +576,20 @@ tf_kernel_library(
         ":tensor_queue_dataset_op",
         ":tensor_slice_dataset_op",
         ":unbatch_dataset_op",
+        ":writer_ops",
         ":zip_dataset_op",
     ],
 )
+
+tf_kernel_library(
+    name = "writer_ops",
+    srcs = ["writer_ops.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
new file mode 100644
index 0000000000..46821fd7b3
--- /dev/null
+++ b/tensorflow/core/kernels/data/writer_ops.cc
@@ -0,0 +1,113 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/record_writer.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+namespace {
+
+class ToTFRecordOp : public AsyncOpKernel {
+ public:
+  explicit ToTFRecordOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        thread_pool_(new thread::ThreadPool(
+            ctx->env(), ThreadOptions(),
+            strings::StrCat("to_tf_record__op_", SanitizeThreadSuffix(name())),
+            1 /* num_threads */, false /* low_latency_hint */)) {}
+
+  template <typename T>
+  Status ParseScalarArgument(OpKernelContext* ctx,
+                             const StringPiece& argument_name, T* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a scalar");
+    }
+    *output = argument_t->scalar<T>()();
+    return Status::OK();
+  }
+
+  // Writes every element of the input dataset to `filename` as a TFRecord.
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    // The call to `iterator->GetNext()` may block and depend on an
+    // inter-op thread pool thread, so we issue the call from the
+    // owned thread pool.
+    thread_pool_->Schedule([this, ctx, done]() {
+      string filename;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ParseScalarArgument<string>(ctx, "filename", &filename), done);
+      string compression_type;
+      OP_REQUIRES_OK_ASYNC(ctx,
+                           ParseScalarArgument<string>(ctx, "compression_type",
+                                                       &compression_type),
+                           done);
+      std::unique_ptr<WritableFile> file;
+      OP_REQUIRES_OK_ASYNC(ctx, ctx->env()->NewWritableFile(filename, &file),
+                           done);
+      std::unique_ptr<io::RecordWriter> writer(new io::RecordWriter(
+          file.get(), io::RecordWriterOptions::CreateRecordWriterOptions(
+                          compression_type)));
+
+      DatasetBase* dataset;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
+      auto iterator = dataset->MakeIterator("ToTFRecordOpIterator");
+
+      IteratorContext::Params params;  // TODO(b/78245447)
+      params.env = ctx->env();
+      params.runner = *(ctx->runner());
+      params.lib = ctx->function_library();
+      DeviceBase* device = ctx->function_library()->device();
+      params.allocator_getter = [device](AllocatorAttributes attrs) {
+        return device->GetAllocator(attrs);
+      };
+      IteratorContext iter_ctx(std::move(params));
+      std::vector<Tensor> components;
+      components.reserve(dataset->output_dtypes().size());
+      bool end_of_sequence;
+      do {
+        OP_REQUIRES_OK_ASYNC(
+            ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+            done);
+        if (!end_of_sequence) {
+          OP_REQUIRES_OK_ASYNC(
+              ctx, writer->WriteRecord(components[0].scalar<string>()()), done);
+        }
+        components.clear();
+      } while (!end_of_sequence);
+      // Finish the output before `done()`: a caller may read the file as soon
+      // as the op completes, and the locals are destroyed only afterwards.
+      writer.reset();
+      OP_REQUIRES_OK_ASYNC(ctx, file->Close(), done);
+      done();
+    });
+  }
+
+ private:
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("DatasetToTFRecord").Device(DEVICE_CPU),
+                        ToTFRecordOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 8be569b315..67c6c58fe2 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -551,4 +551,10 @@ REGISTER_OP("EnqueueInQueueDataset")
     // reading from queue handle (is that even possible?).
     .SetShapeFn(shape_inference::NoOutputs);
 
+REGISTER_OP("DatasetToTFRecord")
+    .Input("input_dataset: variant")
+    .Input("filename: string")
+    .Input("compression_type: string")
+    .SetShapeFn(shape_inference::NoOutputs);
+
 }  // namespace tensorflow
-- 
GitLab


From 5fbd21e3bbd4f89dd2c6eed8a63b66ee2eff40a0 Mon Sep 17 00:00:00 2001
From: Ian Langmore <langmore@google.com>
Date: Thu, 19 Apr 2018 10:20:43 -0700
Subject: [PATCH 148/450] distribution_util moved into its own BUILD target, so
 linear_operator can depend on it.

PiperOrigin-RevId: 193535400
---
 tensorflow/python/ops/distributions/BUILD | 26 ++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/distributions/BUILD b/tensorflow/python/ops/distributions/BUILD
index 9d9ede7ad7..e7ad028376 100644
--- a/tensorflow/python/ops/distributions/BUILD
+++ b/tensorflow/python/ops/distributions/BUILD
@@ -8,9 +8,13 @@ licenses(["notice"])  # Apache 2.0
 
 py_library(
     name = "distributions",
-    srcs = glob(["*.py"]),
+    srcs = glob(
+        ["*.py"],
+        exclude = ["util.py"],
+    ),
     srcs_version = "PY2AND3",
     deps = [
+        ":util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:control_flow_ops",
@@ -26,3 +30,23 @@ py_library(
         "@six_archive//:six",
     ],
 )
+
+py_library(
+    name = "util",
+    srcs = ["util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:special_math_ops",
+        "//tensorflow/python:tensor_util",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
-- 
GitLab


From 72240a9b5e67e315f6c037bb4579df9709335e35 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Fri, 20 Apr 2018 01:23:54 +0800
Subject: [PATCH 149/450] fix single paragraph format and also arrow like
 format

---
 tensorflow/contrib/optimizer_v2/adam.py          | 16 ++++++++--------
 .../api_def/base_api/api_def_ApplyAdam.pbtxt     |  8 ++++----
 .../base_api/api_def_ResourceApplyAdam.pbtxt     |  8 ++++----
 tensorflow/python/training/adam.py               | 16 ++++++++--------
 4 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index a38c98f471..76a867039a 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -40,19 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
     Initialization:
 
-    \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
-    \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
-    \\(t <- 0\\) (Initialize timestep)
+    $$m_0 \Leftarrow 0 (Initialize initial 1st moment vector)$$
+    $$v_0 \Leftarrow 0 (Initialize initial 2nd moment vector)$$
+    $$t \Leftarrow 0 (Initialize timestep)$$
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    $$t <- t + 1$$
-    $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+    $$t \Leftarrow t + 1$$
+    $$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
-    $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-    $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+    $$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
index fc2cb09471..fca8ba2530 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
@@ -82,9 +82,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
-$$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
-$$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-$$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+$$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
index 5c60fa3aa1..8b16d824bf 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -76,8 +76,8 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
-$$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
-$$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-$$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+$$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 }
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index dc0f1aba09..9f523a3aca 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -43,19 +43,19 @@ class AdamOptimizer(optimizer.Optimizer):
 
     Initialization:
 
-    \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
-    \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
-    \\(t <- 0\\) (Initialize timestep)
+    $$m_0 \Leftarrow 0 (Initialize initial 1st moment vector)$$
+    $$v_0 \Leftarrow 0 (Initialize initial 2nd moment vector)$$
+    $$t \Leftarrow 0 (Initialize timestep)$$
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    $$t <- t + 1$$
-    $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+    $$t \Leftarrow t + 1$$
+    $$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
-    $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-    $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+    $$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
-- 
GitLab


From ba3bc495bbf1140e9375e1ec03c3ff788b8ebc6e Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Thu, 19 Apr 2018 10:26:54 -0700
Subject: [PATCH 150/450] Add metric names to model.metrics_names in compile
 for keras models run in eager execution. This prevents us from dropping
 metrics when we run model.evaluate.

PiperOrigin-RevId: 193536341
---
 .../keras/_impl/keras/engine/training.py      | 29 ++-------
 .../_impl/keras/engine/training_eager.py      | 39 ++++--------
 .../_impl/keras/engine/training_eager_test.py | 12 ++--
 .../keras/_impl/keras/engine/training_test.py | 26 ++++++++
 .../_impl/keras/engine/training_utils.py      | 62 +++++++++++++++++++
 5 files changed, 109 insertions(+), 59 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 7c46743814..012d9ceea4 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -276,6 +276,8 @@ class Model(Network):
           self.metrics_names.append(self.output_names[i] + '_loss')
       self.nested_metrics = training_utils.collect_metrics(metrics,
                                                            self.output_names)
+      with K.name_scope('metrics'):
+        training_utils.populate_metric_names(self)
       self._feed_sample_weight_modes = []
       for i in range(len(self.outputs)):
         self._feed_sample_weight_modes.append(None)
@@ -462,7 +464,6 @@ class Model(Network):
         output_weighted_metrics = nested_weighted_metrics[i]
 
         def handle_metrics(metrics, weights=None):
-          metric_name_prefix = 'weighted_' if weights is not None else ''
 
           for metric in metrics:
             if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
@@ -489,39 +490,19 @@ class Model(Network):
                   metric_fn = metrics_module.categorical_accuracy
                 elif metric in ('crossentropy', 'ce'):
                   metric_fn = metrics_module.categorical_crossentropy
-              if metric in ('accuracy', 'acc'):
-                suffix = 'acc'
-              elif metric in ('crossentropy', 'ce'):
-                suffix = 'ce'
               weighted_metric_fn = training_utils.weighted_masked_objective(
                   metric_fn)
-              metric_name = metric_name_prefix + suffix
             else:
               metric_fn = metrics_module.get(metric)
               weighted_metric_fn = training_utils.weighted_masked_objective(
                   metric_fn)
-              # Get metric name as string
-              if hasattr(metric_fn, 'name'):
-                metric_name = metric_fn.name
-              else:
-                metric_name = metric_fn.__name__
-              metric_name = metric_name_prefix + metric_name
-
+            metric_name = training_utils.get_base_metric_name(
+                metric, weighted=weights is not None)
             with K.name_scope(metric_name):
               metric_result = weighted_metric_fn(
                   y_true, y_pred, weights=weights, mask=masks[i])
 
-            # Append to self.metrics_names, self.metric_tensors,
-            # self.stateful_metric_names
-            if len(self.output_names) > 1:
-              metric_name = '%s_%s' % (self.output_names[i], metric_name)
-            # Dedupe name
-            j = 1
-            base_metric_name = metric_name
-            while metric_name in self.metrics_names:
-              metric_name = '%s_%d' % (base_metric_name, j)
-              j += 1
-            self.metrics_names.append(metric_name)
+            training_utils.add_metric_name(self, metric_name, i)
             self.metrics_tensors.append(metric_result)
 
             # Keep track of state updates created by
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
index 695669d9ee..ad239d6151 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -100,7 +100,7 @@ def _eager_metrics_fn(model, outputs, targets):
         metric_names.append(metric_name)
         metric_results.append(backend.mean(metric_result))
 
-  return metric_names, metric_results
+  return metric_results
 
 
 def _model_loss(model, inputs, targets, sample_weights=None, training=False):
@@ -151,7 +151,12 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
       with backend.name_scope(model.output_names[i] + '_loss'):
         output_loss = weighted_masked_fn(
             targets[i], outs[i], weights, mask=mask)
-      loss_metrics.append(backend.mean(output_loss))
+      # If the number of outputs is 1 then we don't append the loss metric
+      # associated with each model output. When there are multiple outputs
+      # associated with a model, each output's loss is calculated and returned
+      # as part of the loss_metrics.
+      if len(model.outputs) > 1:
+        loss_metrics.append(backend.mean(output_loss))
 
       loss_weight = model.loss_weights_list[i]
       if total_loss is None:
@@ -274,7 +279,7 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       model, inputs, targets, sample_weights=sample_weights, training=True)
   if not isinstance(outs, list):
     outs = [outs]
-  _, metrics_results = _eager_metrics_fn(
+  metrics_results = _eager_metrics_fn(
       model, outs, targets)
   if not isinstance(loss, list):
     loss = [loss]
@@ -304,7 +309,7 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       model, inputs, targets, sample_weights=sample_weights, training=False)
   if not isinstance(outs, list):
     outs = [outs]
-  _, metrics_results = _eager_metrics_fn(
+  metrics_results = _eager_metrics_fn(
       model, outs, targets)
   if not isinstance(loss, list):
     loss = [loss]
@@ -498,34 +503,12 @@ def fit_loop(
         for l, o in zip(out_labels, outs):
           batch_logs[l] = o
         # Required for Eager mode
-        metrics_names, metrics_results = _eager_metrics_fn(
-            model, outs, targets_batch)
+        metrics_results = _eager_metrics_fn(model, outs, targets_batch)
         batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
 
-        # TODO(anjalisridhar): Move this to compile to avoid duplicate code.
-        # In graph mode we set the metric names in compile. However in
-        # Eager mode we calculate the metrics for each batch in fit_loop.
-        # We could calculate the metric names and functions in compile.
-        # This would avoid setting the callback parameters separately.
-        # We need to do this for the first iteration alone
-        for m in metrics_names:
-          if m not in callback_metrics:
-            callback_metrics.append(m)
-
-        callbacks.set_params({
-            'batch_size': batch_size,
-            'epochs': epochs,
-            'steps': steps_per_epoch,
-            'samples': num_train_samples,
-            'verbose': verbose,
-            'do_validation': do_validation,
-            'metrics': callback_metrics or [],
-        })
-
         for k, v in zip(model.metrics_names,
                         [backend.mean(loss)] + loss_metrics + metrics_results):
           batch_logs[k] = tensor_util.constant_value(v)
-
         callbacks.on_batch_end(batch_index, batch_logs)
         if callback_model.stop_training:
           break
@@ -611,7 +594,7 @@ def test_loop(model, inputs, targets,
           targets_batch,
           sample_weights=sample_weights_batch,
           training=False)
-      _, metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch)
+      metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch)
       batch_outs = []
       for _, v in zip(model.metrics_names,
                       [backend.mean(loss)] + loss_metrics + metrics_results):
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
index ed0f91ee1e..deaf1d1306 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -212,7 +212,7 @@ class TrainingTest(test.TestCase):
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     loss_weights = [1., 0.5]
-    metrics = ['mae']
+    metrics = ['acc', 'mae']
     model.compile(
         optimizer,
         loss,
@@ -231,20 +231,20 @@ class TrainingTest(test.TestCase):
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         batch_size=5,
         verbose=0)
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
     out = model.evaluate(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         batch_size=5,
         verbose=1)
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
     out = model.evaluate(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         batch_size=5,
         verbose=2)
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
     out = model.test_on_batch([input_a_np, input_b_np],
                               [output_d_np, output_e_np])
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
 
     # Test evaluate with dictionary inputs
     model.evaluate(
@@ -625,7 +625,6 @@ class LossWeightingTest(test.TestCase):
       bad_w_np = np.random.random((10, 2, 2))
       model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
 
-
 class CorrectnessTest(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes()
@@ -649,7 +648,6 @@ class CorrectnessTest(test.TestCase):
     self.assertEqual(
         np.around(history.history['loss'][-1], decimals=4), 0.6173)
 
-
 if __name__ == '__main__':
   ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index 6699fd5212..d9281436de 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -24,12 +24,15 @@ import unittest
 import numpy as np
 
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
 
 try:
   import scipy.sparse as scipy_sparse  # pylint: disable=g-import-not-at-top
@@ -1684,6 +1687,29 @@ class TestTrainingWithDataTensors(test.TestCase):
       model.train_on_batch([input_a_np, input_b_np],
                            [output_a_np, output_b_np])
 
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_metric_names_are_identical_in_graph_and_eager(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae', 'acc']
+    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+    reference_metric_names = ['loss', 'dense_loss', 'dropout_loss',
+                              'dense_mean_absolute_error',
+                              'dense_acc',
+                              'dropout_mean_absolute_error',
+                              'dropout_acc']
+    self.assertEqual(reference_metric_names, model.metrics_names)
 
 if __name__ == '__main__':
   # Bazel sets these environment variables to very long paths.
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
index 48afe48e6c..662938f421 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
@@ -26,6 +26,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import losses
+from tensorflow.python.keras._impl.keras import metrics as metrics_module
 from tensorflow.python.ops import math_ops
 
 
@@ -552,3 +553,64 @@ def standardize_weights(y,
 def has_symbolic_tensors(ls):
   return (any(tensor_util.is_tensor(v) for v in ls)
           and not context.executing_eagerly())
+
+
+def populate_metric_names(model):
+  for i in range(len(model.outputs)):
+    metrics = model.nested_metrics[i]
+    for metric in metrics:
+      base_metric_name = get_base_metric_name(metric)
+      add_metric_name(model, base_metric_name, i)
+
+
+def get_base_metric_name(metric, weighted=False):
+  """Returns the metric name given the metric function.
+
+  Arguments:
+      metric: Metric function name or reference.
+      weighted: Boolean indicating if the metric for which we are adding
+          names is weighted.
+
+  Returns:
+      a metric name.
+  """
+  metric_name_prefix = 'weighted_' if weighted else ''
+  if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
+    if metric in ('accuracy', 'acc'):
+      suffix = 'acc'
+    elif metric in ('crossentropy', 'ce'):
+      suffix = 'ce'
+    metric_name = metric_name_prefix + suffix
+  else:
+    metric_fn = metrics_module.get(metric)
+    # Get metric name as string
+    if hasattr(metric_fn, 'name'):
+      metric_name = metric_fn.name
+    else:
+      metric_name = metric_fn.__name__
+    metric_name = metric_name_prefix + metric_name
+
+  return metric_name
+
+
+def add_metric_name(model, metric_name, index):
+  """Makes the metric name unique and adds it to the model's metric name list.
+
+    If there are multiple outputs for which the metrics are calculated, the
+    metric names have to be made unique by appending an integer.
+
+  Arguments:
+    model: Model to which we are adding metric names.
+    metric_name: Metric name that corresponds to the metric specified by the
+        user. For example: 'acc'
+    index: The index of the model output for which the metric name is being
+        added.
+  """
+  if len(model.output_names) > 1:
+    metric_name = '%s_%s' % (model.output_names[index], metric_name)
+  j = 1
+  base_metric_name = metric_name
+  while metric_name in model.metrics_names:
+    metric_name = '%s_%d' % (base_metric_name, j)
+    j += 1
+  model.metrics_names.append(metric_name)
-- 
GitLab


From 6a7779f3384e48012d3e27ae0f48d410f5174d06 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 10:33:42 -0700
Subject: [PATCH 151/450] Fix undefined signed integer overflow by performing
 addition more carefully.

PiperOrigin-RevId: 193537461
---
 .../core/lib/random/random_distributions.h    | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index 4cf3a999f6..e963511f5c 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include <string.h>
 #include <algorithm>
+#include <type_traits>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/lib/bfloat16/bfloat16.h"
@@ -40,6 +41,20 @@ PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
 // Helper function to convert two 32-bit integers to a double between [0..1).
 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
 
+// Computes a + b. Requires that the result is representable in the destination
+// type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
+// need *not* be representable in that type. (The condition on b excludes the
+// extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
+// compute.)
+template <typename Int>
+PHILOX_DEVICE_INLINE Int SignedAdd(Int a,
+                                   typename std::make_unsigned<Int>::type b) {
+  // Implementation note: both b_div_2 and b - b_div_2 are non-negative and
+  // representable as Int.
+  auto b_div_2 = b >> 1;
+  return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
+}
+
 // A class that generates uniform distribution random numbers from the
 // underlying random integer generator.
 // Arguments:
@@ -172,7 +187,7 @@ class UniformDistribution<Generator, int32> {
     typename Generator::ResultType sample = (*gen)();
     ResultType result;
     for (int i = 0; i < kResultElementCount; ++i) {
-      result[i] = lo_ + static_cast<int32>(sample[i] % range_);
+      result[i] = SignedAdd(lo_, sample[i] % range_);
     }
     return result;
   }
@@ -208,7 +223,7 @@ class UniformDistribution<Generator, int64> {
     ResultType result;
     for (int i = 0; i < kResultElementCount; ++i) {
       auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
-      result[i] = lo_ + static_cast<int64>(bits % range_);
+      result[i] = SignedAdd(lo_, bits % range_);
     }
     return result;
   }
-- 
GitLab


From f196351cd4e21ed6c17dcf544e0fa6cfa3030b4e Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 19 Apr 2018 10:57:55 -0700
Subject: [PATCH 152/450] Allow non-isolated worker sessions to borrow
 `WorkerEnv::device_mgr`.

Without this change, a shared resource (e.g. an Iterator) could not be
created in one session `s1`, and used in a later session `s2` after
`s1` was closed, because the iterator might indirectly capture devices
from the previous session, and use them after they are freed when the
`WorkerSession` was deleted.

The current change only affects the singleton "legacy" WorkerSession,
which is never deleted, but this is necessary to switch all sessions
to use separate WorkerSession objects.

PiperOrigin-RevId: 193541426
---
 tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc  |  2 +-
 tensorflow/core/distributed_runtime/BUILD     |  1 +
 .../base_rendezvous_mgr.cc                    |  4 +-
 .../rpc/rpc_rendezvous_mgr.cc                 |  2 +-
 .../core/distributed_runtime/session_mgr.cc   | 40 +++++++++++++------
 .../core/distributed_runtime/session_mgr.h    |  2 +-
 .../distributed_runtime/session_mgr_test.cc   | 23 ++++++-----
 .../distributed_runtime/worker_session.cc     | 38 +++++++++++++++++-
 .../core/distributed_runtime/worker_session.h | 28 +++++++++++--
 9 files changed, 105 insertions(+), 35 deletions(-)

diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index 28f68cec8c..94f522c04e 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -155,7 +155,7 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
     }
 
     Device* dst_device;
-    Status s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+    Status s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device);
     if (!s.ok()) {
       sess->worker_cache->ReleaseWorker(src_worker, rwi);
       done(s, Args(), recv_args, Tensor{}, false);
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index b07cb8cdcb..d564727da5 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -133,6 +133,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:ptr_util",
         "//tensorflow/core:worker_proto_cc",
     ],
 )
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index bafd9bfc68..5f6931e008 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -253,13 +253,13 @@ void BaseRemoteRendezvous::SameWorkerRecvDone(
 
   WorkerSession* sess = session();
   Device* src_device;
-  Status s = sess->device_mgr->LookupDevice(parsed.src_device, &src_device);
+  Status s = sess->device_mgr()->LookupDevice(parsed.src_device, &src_device);
   if (!s.ok()) {
     done(s);
     return;
   }
   Device* dst_device;
-  s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+  s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device);
   if (!s.ok()) {
     done(s);
     return;
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 067dc5dff5..b8cb538503 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -227,7 +227,7 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
 
   Device* dst_device;
   if (s.ok()) {
-    s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+    s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device);
   }
   if (!s.ok()) {
     if (rwi != nullptr) {
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index e51d63cf2b..357e9f8930 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/protobuf/cluster.pb.h"
 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 
@@ -33,11 +34,11 @@ SessionMgr::SessionMgr(
     WorkerCacheFactory worker_cache_factory)
     : worker_env_(worker_env),
       default_worker_cache_(std::move(default_worker_cache)),
-      legacy_session_(new WorkerSession(
+      legacy_session_(WorkerSession::CreateWithBorrowedDeviceMgr(
           "", default_worker_name,
           std::unique_ptr<WorkerCacheInterface>(
               new WorkerCacheWrapper(default_worker_cache_.get())),
-          std::unique_ptr<DeviceMgr>(worker_env->device_mgr),
+          worker_env->device_mgr,
           std::unique_ptr<GraphMgr>(
               new GraphMgr(worker_env, worker_env->device_mgr)))),
       worker_cache_factory_(std::move(worker_cache_factory)) {}
@@ -71,19 +72,32 @@ Status SessionMgr::CreateSession(const string& session,
   CHECK(!worker_env_->local_devices.empty())
       << "The WorkerEnv must have at least one device in `local_devices`.";
 
-  std::vector<Device*> renamed_devices;
-  for (Device* d : worker_env_->local_devices) {
-    renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
-        worker_name, d, false, isolate_session_state));
-  }
-  std::unique_ptr<DeviceMgr> device_mgr(new DeviceMgr(renamed_devices));
+  std::shared_ptr<WorkerSession> worker_session;
 
-  std::unique_ptr<GraphMgr> graph_mgr(
-      new GraphMgr(worker_env_, device_mgr.get()));
+  if (isolate_session_state) {
+    // Create a private copy of the DeviceMgr for the WorkerSession.
+    std::vector<Device*> renamed_devices;
+    for (Device* d : worker_env_->local_devices) {
+      renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
+          worker_name, d, false, isolate_session_state));
+    }
 
-  std::shared_ptr<WorkerSession> worker_session(new WorkerSession(
-      session, worker_name, std::unique_ptr<WorkerCacheInterface>(worker_cache),
-      std::move(device_mgr), std::move(graph_mgr)));
+    auto device_mgr = MakeUnique<DeviceMgr>(renamed_devices);
+    auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, device_mgr.get());
+    worker_session.reset(
+        new WorkerSession(session, worker_name,
+                          std::unique_ptr<WorkerCacheInterface>(worker_cache),
+                          std::move(device_mgr), std::move(graph_mgr)));
+  } else {
+    // Borrow the WorkerEnv's DeviceMgr for the WorkerSession, so
+    // that resources using it can use its devices after the
+    // WorkerSession has been deleted.
+    auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, worker_env_->device_mgr);
+    worker_session = WorkerSession::CreateWithBorrowedDeviceMgr(
+        session, worker_name,
+        std::unique_ptr<WorkerCacheInterface>(worker_cache),
+        worker_env_->device_mgr, std::move(graph_mgr));
+  }
 
   sessions_.insert(std::make_pair(session, std::move(worker_session)));
   return Status::OK();
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 0a10fe240f..04d1d61409 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -65,7 +65,7 @@ class SessionMgr {
   void ClearLogs();
 
  private:
-  const WorkerEnv* const worker_env_;  // Not owned.
+  WorkerEnv* const worker_env_;  // Not owned.
 
   // A note about destruction:
   // We must delete graph_mgr before device_mgr, due to shared
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 858e636e08..0da333833a 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -43,15 +43,17 @@ class FakeDevice : public Device {
 class SessionMgrTest : public ::testing::Test {
  protected:
   SessionMgrTest()
-      : device_(FakeDevice::MakeCPU(
-            "/job:mnist/replica:0/task:0/device:fakecpu:0")),
-        mgr_(&env_, "/job:mnist/replica:0/task:0",
+      : mgr_(&env_, "/job:mnist/replica:0/task:0",
              std::unique_ptr<WorkerCacheInterface>(), factory_) {
-    TF_CHECK_OK(mgr_.WorkerSessionForSession("", &legacy_session_));
-    env_.local_devices = {device_.get()};
+    Device* device =
+        FakeDevice::MakeCPU("/job:mnist/replica:0/task:0/device:fakecpu:0")
+            .release();
+    env_.local_devices = {device};
+    device_mgr_.reset(new DeviceMgr(env_.local_devices));
+    env_.device_mgr = device_mgr_.get();
   }
 
-  std::unique_ptr<Device> device_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
   WorkerEnv env_;
   SessionMgr::WorkerCacheFactory factory_ =
       [](const ServerDef& server_def, WorkerCacheInterface** worker_cache) {
@@ -59,7 +61,6 @@ class SessionMgrTest : public ::testing::Test {
         return Status::OK();
       };
   SessionMgr mgr_;
-  std::shared_ptr<WorkerSession> legacy_session_;
 };
 
 TEST_F(SessionMgrTest, CreateSessionSimple) {
@@ -84,25 +85,25 @@ TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
   TF_EXPECT_OK(mgr_.CreateSession("handle_1", server_def, false));
   std::shared_ptr<WorkerSession> session_1;
   TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_1", &session_1));
-  std::vector<Device*> devices_1 = session_1->device_mgr->ListDevices();
+  std::vector<Device*> devices_1 = session_1->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_1.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_2", server_def, false));
   std::shared_ptr<WorkerSession> session_2;
   TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_2", &session_2));
-  std::vector<Device*> devices_2 = session_2->device_mgr->ListDevices();
+  std::vector<Device*> devices_2 = session_2->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_2.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_3", server_def, true));
   std::shared_ptr<WorkerSession> session_3;
   TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_3", &session_3));
-  std::vector<Device*> devices_3 = session_3->device_mgr->ListDevices();
+  std::vector<Device*> devices_3 = session_3->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_3.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_4", server_def, true));
   std::shared_ptr<WorkerSession> session_4;
   TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_4", &session_4));
-  std::vector<Device*> devices_4 = session_4->device_mgr->ListDevices();
+  std::vector<Device*> devices_4 = session_4->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_4.size());
 
   EXPECT_EQ(devices_1[0]->resource_manager(), devices_2[0]->resource_manager());
diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc
index 18886babd5..ca6dc1b1de 100644
--- a/tensorflow/core/distributed_runtime/worker_session.cc
+++ b/tensorflow/core/distributed_runtime/worker_session.cc
@@ -95,9 +95,43 @@ WorkerSession::WorkerSession(const string& session_name,
     : session_name(session_name),
       worker_name(worker_name),
       worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
-      device_mgr(std::move(device_mgr)),
       graph_mgr(std::move(graph_mgr)),
       cluster_flr(
-          new ClusterFunctionLibraryRuntime(this, !session_name.empty())) {}
+          new ClusterFunctionLibraryRuntime(this, !session_name.empty())),
+      device_mgr_(std::move(device_mgr)),
+      borrowed_device_mgr_(nullptr) {}
+
+/* static */
+std::shared_ptr<WorkerSession> WorkerSession::CreateWithBorrowedDeviceMgr(
+    const string& session_name, const string& worker_name,
+    std::unique_ptr<WorkerCacheInterface> worker_cache,
+    DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr) {
+  return std::shared_ptr<WorkerSession>(
+      new WorkerSession(session_name, worker_name, std::move(worker_cache),
+                        borrowed_device_mgr, std::move(graph_mgr)));
+}
+
+WorkerSession::WorkerSession(const string& session_name,
+                             const string& worker_name,
+                             std::unique_ptr<WorkerCacheInterface> worker_cache,
+                             DeviceMgr* borrowed_device_mgr,
+                             std::unique_ptr<GraphMgr> graph_mgr)
+    : session_name(session_name),
+      worker_name(worker_name),
+      worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
+      graph_mgr(std::move(graph_mgr)),
+      cluster_flr(
+          new ClusterFunctionLibraryRuntime(this, !session_name.empty())),
+      device_mgr_(nullptr),
+      borrowed_device_mgr_(borrowed_device_mgr) {}
+
+WorkerSession::~WorkerSession() {
+  if (graph_mgr) {
+    Status s = graph_mgr->DeregisterAll();
+    if (!s.ok()) {
+      LOG(WARNING) << "Error during worker session deletion: " << s;
+    }
+  }
+}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h
index 0fd19ac27f..f1faf49364 100644
--- a/tensorflow/core/distributed_runtime/worker_session.h
+++ b/tensorflow/core/distributed_runtime/worker_session.h
@@ -40,10 +40,14 @@ struct WorkerSession {
   // Object from which WorkerInterface instances can be obtained.
   const std::unique_ptr<WorkerCacheInterface> worker_cache;
 
-  // Collection of local devices. These devices are typically RenamedDevices
-  // in all except the SessionMgr.legacy_session_. legacy_session_.device_mgr
-  // == worker_env_.device_mgr, which holds the true devices.
-  const std::unique_ptr<DeviceMgr> device_mgr;
+  // Collection of local devices. These devices are typically
+  // RenamedDevices in all except the SessionMgr.legacy_session_ and
+  // sessions created with `isolate_session_state == false`. In
+  // those cases, this method returns a pointer to a borrowed
+  // DeviceMgr (typically the `worker_env.device_mgr`).
+  DeviceMgr* device_mgr() {
+    return device_mgr_ ? device_mgr_.get() : borrowed_device_mgr_;
+  }
 
   // graph_mgr keeps track of the registered graphs of this session.
   //
@@ -57,6 +61,22 @@ struct WorkerSession {
                 std::unique_ptr<WorkerCacheInterface> worker_cache,
                 std::unique_ptr<DeviceMgr> device_mgr,
                 std::unique_ptr<GraphMgr> graph_mgr);
+
+  static std::shared_ptr<WorkerSession> CreateWithBorrowedDeviceMgr(
+      const string& session_name, const string& worker_name,
+      std::unique_ptr<WorkerCacheInterface> worker_cache,
+      DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr);
+
+  ~WorkerSession();
+
+ private:
+  WorkerSession(const string& session_name, const string& worker_name,
+                std::unique_ptr<WorkerCacheInterface> worker_cache,
+                DeviceMgr* borrowed_device_mgr,
+                std::unique_ptr<GraphMgr> graph_mgr);
+
+  const std::unique_ptr<DeviceMgr> device_mgr_;
+  DeviceMgr* const borrowed_device_mgr_;  // Not owned.
 };
 
 }  // namespace tensorflow
-- 
GitLab


From e77bb988e470d35aca3ea1e27a4f335409f1f4d2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 10:59:08 -0700
Subject: [PATCH 153/450] Fix open source BUILD bugs for cloud profiler.
 Increment version for releasing cloud_tpu_profiler 1.6 with pod profiling
 support.

PiperOrigin-RevId: 193541692
---
 .../tpu/profiler/capture_tpu_profile.cc       | 12 +++++-----
 .../pip_package/cloud_tpu_profiler/main.py    | 23 +++++++++++++++++--
 .../contrib/tpu/profiler/pip_package/setup.py |  2 +-
 tensorflow/contrib/tpu/profiler/version.h     |  2 +-
 4 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index a535884263..816897499b 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -41,7 +41,7 @@ namespace tensorflow {
 namespace tpu {
 namespace {
 
-using ::tensorflow::grpc::TPUProfileAnalysis;
+using ::tensorflow::TPUProfileAnalysis;
 using ::tensorflow::TPUProfiler;
 
 constexpr uint64 kMaxEvents = 1000000;
@@ -137,9 +137,9 @@ bool NewSession(const string& service_addr,
       PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
   new_session_request.set_repository_root(repository_root);
   new_session_request.set_session_id(session_id);
-  std::copy(
-      hostnames.begin(), hostnames.end(),
-      proto2::RepeatedFieldBackInserter(new_session_request.mutable_hosts()));
+  for (const auto& hostname : hostnames) {
+    new_session_request.add_hosts(hostname);
+  }
 
   ::grpc::ClientContext context;
   ::grpc::ChannelArguments channel_args;
@@ -159,8 +159,8 @@ bool NewSession(const string& service_addr,
   TF_QCHECK_OK(FromGrpcStatus(
       stub->NewSession(&context, new_session_request, &new_session_response)));
 
-  std::cout << "Profile session succeed for hosts:"
-            << str_util::Join(hostnames, ",");
+  std::cout << "Profile session succeed for host(s):"
+            << str_util::Join(hostnames, ",") << std::endl;
   return new_session_response.empty_trace();
 }
 
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 0b78cf8695..508c7a842f 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -37,12 +37,17 @@ flags.DEFINE_string(
     'will attempt to automatically detect the GCE project from metadata.')
 flags.DEFINE_string('tpu_name', None,
                     'Name of the Cloud TPU for Cluster Resolvers. You must '
-                    'specify either this flag or --master.')
+                    'specify either this flag or --service_addr.')
 
 # Tool specific parameters
 flags.DEFINE_string(
     'service_addr', None, 'Address of TPU profiler service e.g. '
     'localhost:8466, you must specify either this flag or --tpu_name.')
+flags.DEFINE_string(
+    'workers_list', None, 'The list of worker TPUs that we are about to profile'
+    ' e.g. 10.0.1.2, 10.0.1.3. You can specify this flag with --tpu_name or '
+    '--service_addr to profile a subset of tpu nodes. You can also use only '
+    '--tpu_name and leave this flag unspecified to profile all the tpus.')
 flags.DEFINE_string('logdir', None,
                     'Path of TensorBoard log directory e.g. /tmp/tb_log, '
                     'gs://tb_bucket')
@@ -56,18 +61,25 @@ flags.DEFINE_boolean('include_dataset_ops', True,
 
 FLAGS = flags.FLAGS
 EXECUTABLE = 'data/capture_tpu_profile'
+JOB_NAME = 'worker'
 
+def get_workers_list(cluster_resolver):
+  cluster_spec = cluster_resolver.cluster_spec()
+  task_indices = cluster_spec.task_indices(JOB_NAME)
+  workers_list = [cluster_spec.task_address(JOB_NAME, i).split(':')[0]
+                  for i in task_indices]
+  return ','.join(workers_list)
 
 def run_main():
   tf.app.run(main)
 
-
 def main(unused_argv=None):
   tf.logging.set_verbosity(tf.logging.INFO)
 
   if FLAGS.service_addr is None and FLAGS.tpu_name is None:
     sys.exit('You must specify either --service_addr or --tpu_name.')
 
+  tpu_cluster_resolver = None
   if FLAGS.service_addr is not None:
     if FLAGS.tpu_name is not None:
       tf.logging.warn('Both --service_addr and --tpu_name are set. Ignoring '
@@ -82,6 +94,12 @@ def main(unused_argv=None):
     service_addr = tpu_cluster_resolver.get_master()
   service_addr = service_addr.replace('grpc://', '').replace(':8470', ':8466')
 
+  workers_list = ""
+  if FLAGS.workers_list is not None:
+    workers_list = FLAGS.workers_list
+  elif tpu_cluster_resolver is not None:
+    workers_list = get_workers_list(tpu_cluster_resolver)
+
   if not FLAGS.logdir:
     sys.exit('logdir must be provided.')
   executable_path = os.path.join(os.path.dirname(__file__), EXECUTABLE)
@@ -89,6 +107,7 @@ def main(unused_argv=None):
   cmd = [executable_path]
   cmd.append('--logdir=' + logdir)
   cmd.append('--service_addr=' + service_addr)
+  cmd.append('--workers_list=' + workers_list)
   cmd.append('--duration_ms=' + str(FLAGS.duration_ms))
   cmd.append('--num_tracing_attempts=' + str(FLAGS.num_tracing_attempts))
   cmd.append('--include_dataset_ops=' + str(FLAGS.include_dataset_ops).lower())
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index 8d99835b64..ebd478fd02 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from setuptools import setup
 
-_VERSION = '1.6.0-rc1'
+_VERSION = '1.6.0'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h
index dc6a934891..618479e1a6 100644
--- a/tensorflow/contrib/tpu/profiler/version.h
+++ b/tensorflow/contrib/tpu/profiler/version.h
@@ -16,6 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 
-#define TPU_PROFILER_VERSION "1.5.0"
+#define TPU_PROFILER_VERSION "1.6.0"
 
 #endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
-- 
GitLab


From 62c3b7dece92a3ad1a39e7c4eb0894411e435258 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 11:08:08 -0700
Subject: [PATCH 154/450] Updating tests in constant_folding_test.cc so that
 they Evaluate the optimized and original graph and check if their outputs are
 the same.

PiperOrigin-RevId: 193543478
---
 .../optimizers/constant_folding_test.cc       | 52 +++++++++++++++++--
 1 file changed, 47 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 36625b68b7..1acce05909 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -689,8 +689,7 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
   GrapplerItem item;
   item.fetch.push_back("e");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
-  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
-  EXPECT_EQ(1, tensors_expected.size());
+
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -717,9 +716,6 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
     }
   }
   EXPECT_EQ(1, found);
-  auto tensors = EvaluateNodes(output, item.fetch);
-  EXPECT_EQ(1, tensors.size());
-  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, ControlDependenciesEmptyFetch) {
@@ -995,6 +991,18 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
     }
   }
   EXPECT_EQ(3, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({11, 13}));
+  std::vector<string> fetch_nodes = {"p2"};
+  auto tensors_expected = EvaluateNodes(
+      item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes,
+                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
@@ -1192,6 +1200,30 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
     }
   }
   EXPECT_EQ(4, found);
+
+  auto v_in_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  Tensor v_ctrl_t(DT_BOOL, TensorShape({}));
+
+  v_ctrl_t.flat<bool>()(0) = true;
+  std::vector<string> fetch_nodes = {"m", "m2"};
+  auto tensors_expected = EvaluateNodes(
+      item.graph, fetch_nodes, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes,
+                               {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
+
+  v_ctrl_t.flat<bool>()(0) = false;
+  tensors_expected = EvaluateNodes(item.graph, fetch_nodes,
+                                   {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  tensors = EvaluateNodes(output, fetch_nodes,
+                          {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, SwitchNodes) {
@@ -1268,6 +1300,16 @@ TEST_F(ConstantFoldingTest, SwitchNodes) {
   EXPECT_EQ(2, tensors.size());
   test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
   test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
+
+  v_ctrl_t.flat<bool>()(0) = false;
+  tensors_expected = EvaluateNodes(item.graph, item.fetch,
+                                   {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  tensors = EvaluateNodes(output, item.fetch,
+                          {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, MergeNodes) {
-- 
GitLab


From 9b496c9134529f6d85f0e9757099104cf506cbd6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 11:21:21 -0700
Subject: [PATCH 155/450] Update ops-related pbtxt files.

PiperOrigin-RevId: 193546050
---
 tensorflow/core/ops/compat/ops_history.v1.pbtxt | 15 +++++++++++++++
 tensorflow/core/ops/ops.pbtxt                   | 15 +++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 9bc11cf0fe..dbd6f859c4 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -15829,6 +15829,21 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "DatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
 op {
   name: "DebugGradientIdentity"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 9b665190ce..46afe357f0 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -7051,6 +7051,21 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "DatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
 op {
   name: "DebugGradientIdentity"
   input_arg {
-- 
GitLab


From 87229e4fc3bc23c7a92bfdf40e5834ac65a00d34 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 11:47:28 -0700
Subject: [PATCH 156/450] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 193550428

---
 tensorflow/go/op/wrappers.go | 72 ++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 35ad1eff0f..3b3dff0573 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -3105,6 +3105,42 @@ func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output)
 	return op.Output(0)
 }
 
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Identity",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+//
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atan2",
+		Input: []tf.Input{
+			y, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a dataset that passes a sliding window over `input_dataset`.
 //
 // Arguments:
@@ -25383,42 +25419,6 @@ func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ..
 	return op.Output(0)
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
-//
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atan2",
-		Input: []tf.Input{
-			y, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Identity",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Gather slices from `params` axis `axis` according to `indices`.
 //
 // `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-- 
GitLab


From 78db5136edf30667090988c703f98f4f8c4c4269 Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Thu, 19 Apr 2018 11:52:10 -0700
Subject: [PATCH 157/450] Implements linear_model using _LinearModel. Added
 support for cols_to_vars in _LinearModel in order to make this possible.
 Also, made some fixes so that variable names come out the same as before.

PiperOrigin-RevId: 193551353
---
 .../python/feature_column/feature_column.py   | 106 ++++++++--------
 .../feature_column/feature_column_test.py     | 117 ++++++++++++------
 .../training/warm_starting_util_test.py       |  16 +--
 3 files changed, 138 insertions(+), 101 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 0ad8131599..87a52f8441 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -409,58 +409,19 @@ def linear_model(features,
     ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
       nor `_CategoricalColumn`.
   """
-  feature_columns = _clean_feature_columns(feature_columns)
-  for column in feature_columns:
-    if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
-      raise ValueError('Items of feature_columns must be either a _DenseColumn '
-                       'or _CategoricalColumn. Given: {}'.format(column))
-  weight_collections = list(weight_collections or [])
-  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
-  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
-  with variable_scope.variable_scope(
-      None, default_name='linear_model', values=features.values()):
-    weighted_sums = []
-    ordered_columns = []
-    builder = _LazyBuilder(features)
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      with variable_scope.variable_scope(
-          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
-        ordered_columns.append(column)
-        weighted_sum = _create_weighted_sum(
-            column=column,
-            builder=builder,
-            units=units,
-            sparse_combiner=sparse_combiner,
-            weight_collections=weight_collections,
-            trainable=trainable)
-        weighted_sums.append(weighted_sum)
-        if cols_to_vars is not None:
-          # Retrieve the variables created.
-          cols_to_vars[column] = ops.get_collection(
-              ops.GraphKeys.GLOBAL_VARIABLES,
-              scope=variable_scope.get_variable_scope().name)
-    _verify_static_batch_size_equality(weighted_sums, ordered_columns)
-    predictions_no_bias = math_ops.add_n(
-        weighted_sums, name='weighted_sum_no_bias')
-    bias = variable_scope.get_variable(
-        'bias_weights',
-        shape=[units],
-        initializer=init_ops.zeros_initializer(),
-        trainable=trainable,
-        collections=weight_collections)
-    predictions = nn_ops.bias_add(
-        predictions_no_bias, bias, name='weighted_sum')
-    if cols_to_vars is not None:
-      # Add the bias to cols_to_vars as well, converting the Variable or
-      # PartitionedVariable to a list of Variable's.
-      if (isinstance(bias, variables.Variable) or
-          resource_variable_ops.is_resource_variable(bias)):
-        cols_to_vars['bias'] = [bias]
-      else:  # Must be a PartitionedVariable.
-        cols_to_vars['bias'] = list(bias)
-    return predictions
+  linear_model_layer = _LinearModel(
+      feature_columns=feature_columns,
+      units=units,
+      sparse_combiner=sparse_combiner,
+      weight_collections=weight_collections,
+      trainable=trainable,
+      name='linear_model')
+  retval = linear_model_layer(features)  # pylint: disable=not-callable
+  if cols_to_vars is None:
+    return retval
+  for k, v in linear_model_layer.cols_to_vars().items():
+    cols_to_vars[k] = v
+  return retval
 
 
 def _add_to_collections(var, weight_collections):
@@ -551,8 +512,22 @@ class _BiasLayer(base.Layer):
     return self._bias_variable
 
 
+def _get_expanded_variable_list(variable):
+  if (isinstance(variable, variables.Variable) or
+      resource_variable_ops.is_resource_variable(variable)):
+    return [variable]  # Single variable case.
+  else:  # Must be a PartitionedVariable, so convert into a list.
+    return list(variable)
+
+
+def _strip_leading_slashes(name):
+  return name.rsplit('/', 1)[-1]
+
+
 class _LinearModel(training.Model):
   """Creates a linear model using feature columns.
+
+  See `linear_model` for details.
   """
 
   def __init__(self,
@@ -573,7 +548,10 @@ class _LinearModel(training.Model):
     for column in sorted(self._feature_columns, key=lambda x: x.name):
       with variable_scope.variable_scope(
           None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
-        column_name = vs.name
+        # Having the fully expressed variable scope name ends up doubly
+        # expressing the outer scope (scope with which this method was called)
+        # in the name of the variable that would get created.
+        column_name = _strip_leading_slashes(vs.name)
       column_layer = _FCLinearWrapper(column, units, sparse_combiner,
                                       self._weight_collections, trainable,
                                       column_name, **kwargs)
@@ -585,6 +563,15 @@ class _LinearModel(training.Model):
         weight_collections=self._weight_collections,
         name='bias_layer',
         **kwargs)
+    self._cols_to_vars = {}
+
+  def cols_to_vars(self):
+    """Returns a dict mapping _FeatureColumns to variables.
+
+    See `linear_model` for more information.
+    This is not populated till `call` is called i.e. layer is built.
+    """
+    return self._cols_to_vars
 
   def call(self, features):
     with variable_scope.variable_scope(self.name):
@@ -597,15 +584,24 @@ class _LinearModel(training.Model):
       ordered_columns = []
       builder = _LazyBuilder(features)
       for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
-        ordered_columns.append(layer._feature_column)  # pylint: disable=protected-access
+        column = layer._feature_column  # pylint: disable=protected-access
+        ordered_columns.append(column)
         weighted_sum = layer(builder)
         weighted_sums.append(weighted_sum)
+        self._cols_to_vars[column] = ops.get_collection(
+            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)
 
       _verify_static_batch_size_equality(weighted_sums, ordered_columns)
       predictions_no_bias = math_ops.add_n(
           weighted_sums, name='weighted_sum_no_bias')
       predictions = nn_ops.bias_add(
-          predictions_no_bias, self._bias_layer(builder), name='weighted_sum')  # pylint: disable=not-callable
+          predictions_no_bias,
+          self._bias_layer(  # pylint: disable=not-callable
+              builder,
+              scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
+          name='weighted_sum')
+      bias = self._bias_layer.variables[0]
+      self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
     return predictions
 
   def _add_layers(self, layers):
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 46404abadc..49e06b8245 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -345,7 +345,7 @@ class NumericColumnTest(test.TestCase):
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -584,7 +584,7 @@ class BucketizedColumnTest(test.TestCase):
       features = {'price': [[-1.], [1.], [5.], [6.]]}
       predictions = get_keras_linear_model_predictions(features,
                                                        [bucketized_price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -610,7 +610,7 @@ class BucketizedColumnTest(test.TestCase):
       features = {'price': [[-1., 1.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(features,
                                                        [bucketized_price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -849,7 +849,7 @@ class HashedCategoricalColumnTest(test.TestCase):
                   values=('marlo', 'skywalker', 'omar'),
                   dense_shape=(2, 2))
       }, (wire_column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
@@ -1171,7 +1171,7 @@ class CrossedColumnTest(test.TestCase):
                   values=['cA', 'cB', 'cC'],
                   dense_shape=(2, 2)),
       }, (crossed,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
         self.assertAllClose((0.,), bias.eval())
@@ -1254,18 +1254,13 @@ def get_linear_model_column_var(column):
                             'linear_model/' + column.name)[0]
 
 
-def get_keras_linear_model_bias():
-  with variable_scope.variable_scope('linear_model', reuse=True):
-    with variable_scope.variable_scope('bias_layer', reuse=True):
-      return variable_scope.get_variable('bias_weights')
-
-
 def get_keras_linear_model_predictions(features,
                                        feature_columns,
                                        units=1,
                                        sparse_combiner='sum',
                                        weight_collections=None,
-                                       trainable=True):
+                                       trainable=True,
+                                       cols_to_vars=None):
   keras_linear_model = _LinearModel(
       feature_columns,
       units,
@@ -1273,7 +1268,12 @@ def get_keras_linear_model_predictions(features,
       weight_collections,
       trainable,
       name='linear_model')
-  return keras_linear_model(features)  # pylint: disable=not-callable
+  retval = keras_linear_model(features)  # pylint: disable=not-callable
+  if cols_to_vars is None:
+    return retval
+  for k, v in keras_linear_model.cols_to_vars().items():
+    cols_to_vars[k] = v
+  return retval
 
 
 @test_util.with_c_api
@@ -1977,7 +1977,7 @@ class _LinearModelTest(test.TestCase):
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -1994,7 +1994,7 @@ class _LinearModelTest(test.TestCase):
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor}
       predictions = get_keras_linear_model_predictions(features, [wire_cast])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -2014,7 +2014,7 @@ class _LinearModelTest(test.TestCase):
       features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features,
                                                        [wire_cast, price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
@@ -2072,7 +2072,7 @@ class _LinearModelTest(test.TestCase):
       features = {dense_and_sparse_column.name: sp_tensor}
       predictions = get_keras_linear_model_predictions(
           features, [dense_and_sparse_column])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       dense_and_sparse_column_var = get_linear_model_column_var(
           dense_and_sparse_column)
       with _initialized_session() as sess:
@@ -2088,7 +2088,7 @@ class _LinearModelTest(test.TestCase):
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(
           features, [price], units=3)
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
@@ -2108,7 +2108,7 @@ class _LinearModelTest(test.TestCase):
       features = {'wire_cast': wire_tensor}
       predictions = get_keras_linear_model_predictions(
           features, [wire_cast], units=3)
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
@@ -2163,7 +2163,7 @@ class _LinearModelTest(test.TestCase):
       features = {'wire_cast': wire_tensor}
       predictions = get_keras_linear_model_predictions(
           features, [wire_cast], sparse_combiner='mean')
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
@@ -2176,7 +2176,7 @@ class _LinearModelTest(test.TestCase):
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(
           features, [price], units=3)
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
@@ -2206,7 +2206,7 @@ class _LinearModelTest(test.TestCase):
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       predictions = get_keras_linear_model_predictions(features, [price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -2222,7 +2222,7 @@ class _LinearModelTest(test.TestCase):
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       predictions = get_keras_linear_model_predictions(features,
                                                        [price1, price2])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
       with _initialized_session() as sess:
@@ -2235,6 +2235,45 @@ class _LinearModelTest(test.TestCase):
         sess.run(bias.assign([7.]))
         self.assertAllClose([[3217.], [4657.]], predictions.eval())
 
+  def test_fills_cols_to_vars(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      cols_to_vars = {}
+      get_keras_linear_model_predictions(
+          features, [price1, price2], cols_to_vars=cols_to_vars)
+      bias = get_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      self.assertAllEqual(cols_to_vars['bias'], [bias])
+      self.assertAllEqual(cols_to_vars[price1], [price1_var])
+      self.assertAllEqual(cols_to_vars[price2], [price2_var])
+
+  def test_fills_cols_to_vars_partitioned_variables(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2', shape=3)
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [6., 7.]],
+          'price2': [[3., 4., 5.], [8., 9., 10.]]
+      }
+      cols_to_vars = {}
+      with variable_scope.variable_scope(
+          'linear',
+          partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)):
+        get_keras_linear_model_predictions(
+            features, [price1, price2], cols_to_vars=cols_to_vars)
+      with _initialized_session():
+        self.assertEqual([0.], cols_to_vars['bias'][0].eval())
+        # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables.
+        self.assertAllEqual([[0.]], cols_to_vars[price1][0].eval())
+        self.assertAllEqual([[0.]], cols_to_vars[price1][1].eval())
+        # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and
+        # a [1, 1] Variable.
+        self.assertAllEqual([[0.], [0.]], cols_to_vars[price2][0].eval())
+        self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval())
+
   def test_dense_collection(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
@@ -2242,7 +2281,7 @@ class _LinearModelTest(test.TestCase):
       get_keras_linear_model_predictions(
           features, [price], weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       self.assertIn(bias, my_vars)
       self.assertIn(price_var, my_vars)
@@ -2256,7 +2295,7 @@ class _LinearModelTest(test.TestCase):
       get_keras_linear_model_predictions(
           features, [wire_cast], weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       self.assertIn(bias, my_vars)
       self.assertIn(wire_cast_var, my_vars)
@@ -2266,7 +2305,7 @@ class _LinearModelTest(test.TestCase):
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       get_keras_linear_model_predictions(features, [price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertIn(bias, trainable_vars)
@@ -2280,7 +2319,7 @@ class _LinearModelTest(test.TestCase):
       features = {'wire_cast': wire_tensor}
       get_keras_linear_model_predictions(features, [wire_cast])
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       self.assertIn(bias, trainable_vars)
       self.assertIn(wire_cast_var, trainable_vars)
@@ -2427,7 +2466,7 @@ class _LinearModelTest(test.TestCase):
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
 
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_buckets_var = get_linear_model_column_var(price_buckets)
       body_style_var = get_linear_model_column_var(body_style)
 
@@ -2470,7 +2509,7 @@ class _LinearModelTest(test.TestCase):
     net = get_keras_linear_model_predictions(features,
                                              [price_buckets, body_style])
     with _initialized_session() as sess:
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_buckets_var = get_linear_model_column_var(price_buckets)
       body_style_var = get_linear_model_column_var(body_style)
 
@@ -2509,7 +2548,7 @@ class _LinearModelTest(test.TestCase):
 
     net = get_keras_linear_model_predictions(
         features, [price_buckets, body_style, country])
-    bias = get_keras_linear_model_bias()
+    bias = get_linear_model_bias()
     price_buckets_var = get_linear_model_column_var(price_buckets)
     body_style_var = get_linear_model_column_var(body_style)
     with _initialized_session() as sess:
@@ -3688,7 +3727,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
                   values=('marlo', 'skywalker', 'omar'),
                   dense_shape=(2, 2))
       }, (wire_column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
@@ -4080,7 +4119,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
                   values=('marlo', 'skywalker', 'omar'),
                   dense_shape=(2, 2))
       }, (wire_column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
@@ -4326,7 +4365,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
                   values=(0, 2, 1),
                   dense_shape=(2, 2))
       }, (column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
@@ -5108,7 +5147,7 @@ class EmbeddingColumnTest(test.TestCase):
           categorical_column.name: sparse_input
       }, (embedding_column,))
       expected_var_names = (
-          'linear_model/bias_layer/bias_weights:0',
+          'linear_model/bias_weights:0',
           'linear_model/aaa_embedding/weights:0',
           'linear_model/aaa_embedding/embedding_weights:0',
       )
@@ -5120,7 +5159,7 @@ class EmbeddingColumnTest(test.TestCase):
           for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       }
       self.assertItemsEqual(expected_var_names, trainable_vars.keys())
-      bias = trainable_vars['linear_model/bias_layer/bias_weights:0']
+      bias = trainable_vars['linear_model/bias_weights:0']
       embedding_weights = trainable_vars[
           'linear_model/aaa_embedding/embedding_weights:0']
       linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
@@ -5757,7 +5796,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
       # Linear weights do not follow the column name. But this is a rare use
       # case, and fixing it would add too much complexity to the code.
       expected_var_names = (
-          'linear_model/bias_layer/bias_weights:0',
+          'linear_model/bias_weights:0',
           'linear_model/aaa_bbb_shared_embedding/weights:0',
           'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
           'linear_model/aaa_bbb_shared_embedding_1/weights:0',
@@ -5770,7 +5809,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       }
       self.assertItemsEqual(expected_var_names, trainable_vars.keys())
-      bias = trainable_vars['linear_model/bias_layer/bias_weights:0']
+      bias = trainable_vars['linear_model/bias_weights:0']
       embedding_weights = trainable_vars[
           'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
       linear_weights_a = trainable_vars[
@@ -6105,7 +6144,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
                   values=(.5, 1., .1),
                   dense_shape=(2, 2))
       }, (column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
@@ -6172,7 +6211,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
                   dense_shape=(2, 2)),
           'values': ((.5,), (1.,), (.1,))
       }, (column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
diff --git a/tensorflow/python/training/warm_starting_util_test.py b/tensorflow/python/training/warm_starting_util_test.py
index 6e445d8bd1..7e8cbd6bae 100644
--- a/tensorflow/python/training/warm_starting_util_test.py
+++ b/tensorflow/python/training/warm_starting_util_test.py
@@ -946,18 +946,20 @@ class WarmStartingUtilTest(test.TestCase):
         # emb_vocab should be correctly warm-started after vocab remapping.
         # Missing values are filled in with the EmbeddingColumn's initializer.
         self._assert_cols_to_vars(
-            cols_to_vars, {
+            cols_to_vars,
+            {
                 emb_vocab: [
-                    # embedding_weights part 0.
-                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
-                    # embedding_weights part 1.
-                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]]),
                     # linear weights part 0.
                     np.array([[0.69]]),
                     # linear weights part 1.
-                    np.array([[0.71]])
+                    np.array([[0.71]]),
+                    # embedding_weights part 0.
+                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
+                    # embedding_weights part 1.
+                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]])
                 ]
-            }, sess)
+            },
+            sess)
 
   def testErrorConditions(self):
     x = variable_scope.get_variable(
-- 
GitLab


From 173aadc6b62dd95691257c2d9f158dd9044bb4ef Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 11:55:46 -0700
Subject: [PATCH 158/450] Change estimator to only log non-binary eval metrics,
 because logging binary metrics such as images will lead to crash.

PiperOrigin-RevId: 193551927
---
 tensorflow/python/estimator/estimator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index a42b6cfee8..9862fdecdb 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -1256,7 +1256,8 @@ def _dict_to_str(dictionary):
     A `str` representing the `dictionary`.
   """
   return ', '.join('%s = %s' % (k, v)
-                   for k, v in sorted(six.iteritems(dictionary)))
+                   for k, v in sorted(six.iteritems(dictionary))
+                   if not isinstance(v, six.binary_type))
 
 
 def _write_dict_to_summary(output_dir,
-- 
GitLab


From fb02b02689b0e126c93cbcb8462e8417e1d954cc Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 19 Apr 2018 11:57:36 -0700
Subject: [PATCH 159/450] Avoid looking up the shape functions multiple times
 Improved the handling of fed nodes

PiperOrigin-RevId: 193552210
---
 .../core/grappler/costs/graph_properties.cc   | 155 +++++++++---------
 .../core/grappler/costs/graph_properties.h    |   7 -
 2 files changed, 78 insertions(+), 84 deletions(-)

diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index c83ddfe90a..dd2d53dfdf 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -395,8 +395,11 @@ class TopoQueue {
 // unknown shape/dimension of a given node.
 class SymbolicShapeRefiner {
  public:
-  explicit SymbolicShapeRefiner(const GraphDef& graph)
-      : function_library_(OpRegistry::Global(), graph.library()) {
+  explicit SymbolicShapeRefiner(
+      const GraphDef& graph,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports)
+      : function_library_(OpRegistry::Global(), graph.library()),
+        fed_ports_(fed_ports) {
     graph_def_version_ = graph.versions().producer();
     node_to_context_.reserve(graph.node_size());
   }
@@ -704,6 +707,9 @@ class SymbolicShapeRefiner {
     std::vector<ShapeHandle> input_tensors_as_shapes;
 
     NodeContext& node_ctx = node_to_context_[node];
+    TF_RETURN_IF_ERROR(
+        function_library_.LookUp(node->type_string(), &node_ctx.op_data));
+
     node_ctx.inference_context.reset(new InferenceContext(
         graph_def_version_, &node->def(), node->op_def(), input_shapes,
         input_tensors, input_tensors_as_shapes,
@@ -716,6 +722,7 @@ class SymbolicShapeRefiner {
   }
 
   struct NodeContext {
+    const OpRegistrationData* op_data;
     std::unique_ptr<InferenceContext> inference_context;
     std::vector<ShapeHandle> output_tensors_as_shapes;
   };
@@ -723,65 +730,80 @@ class SymbolicShapeRefiner {
   Status InferShapes(const Node* node, NodeContext* c) {
     InferenceContext* ic = c->inference_context.get();
 
-    // Propagate shape tensors
-    if (node->type_string() == "Shape") {
-      c->output_tensors_as_shapes.resize(1);
-      c->output_tensors_as_shapes[0] = c->inference_context->input(0);
-    } else if (node->type_string() == "ShapeN") {
-      c->output_tensors_as_shapes.resize(c->inference_context->num_inputs());
-      for (int i = 0; i < c->inference_context->num_inputs(); ++i) {
-        c->output_tensors_as_shapes[i] = c->inference_context->input(i);
-      }
-    } else if (node->type_string() == "ConcatV2") {
-      bool valid = true;
-      ShapeHandle result;
-      for (int i = 0; i < ic->num_inputs() - 1; ++i) {
-        ShapeHandle input = ic->input_tensors_as_shapes()[i];
-        if (!ic->RankKnown(input)) {
-          valid = false;
-          break;
-        } else if (i == 0) {
-          result = input;
-        } else {
-          TF_RETURN_IF_ERROR(ic->Concatenate(result, input, &result));
-        }
-      }
-      if (valid) {
+    auto it = fed_ports_.find(node->name());
+    const bool is_fed = it != fed_ports_.end();
+
+    // Propagate shape tensors unless the node is fed.
+    // TODO(bsteiner) We should still propagate the shapes to the ports that
+    // aren't fed in the case of a ShapeN node.
+    if (!is_fed) {
+      if (node->type_string() == "Shape") {
         c->output_tensors_as_shapes.resize(1);
-        c->output_tensors_as_shapes[0] = result;
-      }
-    } else if (node->type_string() == "Slice") {
-      ShapeHandle input = ic->input_tensors_as_shapes()[0];
-      bool valid = ic->RankKnown(input);
-      const Tensor* slice_offset = ic->input_tensor(1);
-      valid &= slice_offset != nullptr && slice_offset->NumElements() == 1;
-      const Tensor* slice_size = ic->input_tensor(2);
-      valid &= slice_size != nullptr && slice_size->NumElements() == 1;
-      if (valid) {
-        int64 start = slice_offset->dtype() == DT_INT32
-                          ? slice_offset->flat<int32>()(0)
-                          : slice_offset->flat<int64>()(0);
-        int64 end = start + (slice_size->dtype() == DT_INT32
-                                 ? slice_size->flat<int32>()(0)
-                                 : slice_size->flat<int64>()(0));
+        c->output_tensors_as_shapes[0] = c->inference_context->input(0);
+      } else if (node->type_string() == "ShapeN") {
+        c->output_tensors_as_shapes.resize(c->inference_context->num_inputs());
+        for (int i = 0; i < c->inference_context->num_inputs(); ++i) {
+          c->output_tensors_as_shapes[i] = c->inference_context->input(i);
+        }
+      } else if (node->type_string() == "ConcatV2") {
+        bool valid = true;
         ShapeHandle result;
-        TF_RETURN_IF_ERROR(ic->Subshape(input, start, end, &result));
-        c->output_tensors_as_shapes.resize(1);
-        c->output_tensors_as_shapes[0] = result;
+        for (int i = 0; i < ic->num_inputs() - 1; ++i) {
+          ShapeHandle input = ic->input_tensors_as_shapes()[i];
+          if (!ic->RankKnown(input)) {
+            valid = false;
+            break;
+          } else if (i == 0) {
+            result = input;
+          } else {
+            TF_RETURN_IF_ERROR(ic->Concatenate(result, input, &result));
+          }
+        }
+        if (valid) {
+          c->output_tensors_as_shapes.resize(1);
+          c->output_tensors_as_shapes[0] = result;
+        }
+      } else if (node->type_string() == "Slice") {
+        ShapeHandle input = ic->input_tensors_as_shapes()[0];
+        bool valid = ic->RankKnown(input);
+        const Tensor* slice_offset = ic->input_tensor(1);
+        valid &= slice_offset != nullptr && slice_offset->NumElements() == 1;
+        const Tensor* slice_size = ic->input_tensor(2);
+        valid &= slice_size != nullptr && slice_size->NumElements() == 1;
+        if (valid) {
+          int64 start = slice_offset->dtype() == DT_INT32
+                            ? slice_offset->flat<int32>()(0)
+                            : slice_offset->flat<int64>()(0);
+          int64 end = start + (slice_size->dtype() == DT_INT32
+                                   ? slice_size->flat<int32>()(0)
+                                   : slice_size->flat<int64>()(0));
+          ShapeHandle result;
+          TF_RETURN_IF_ERROR(ic->Subshape(input, start, end, &result));
+          c->output_tensors_as_shapes.resize(1);
+          c->output_tensors_as_shapes[0] = result;
+        }
       }
     }
 
     // Infer the shapes of output tensors.
-    const OpRegistrationData* op_reg_data;
-    Status s = function_library_.default_registry()->LookUp(node->type_string(),
-                                                            &op_reg_data);
-    if (!s.ok() || op_reg_data->shape_inference_fn == nullptr) {
+    if (!c->op_data || c->op_data->shape_inference_fn == nullptr) {
       // There is nothing more we can infer, annotate outputs with unknown
       // shapes
       return c->inference_context->Run(shape_inference::UnknownShape);
     }
 
-    return c->inference_context->Run(op_reg_data->shape_inference_fn);
+    TF_RETURN_IF_ERROR(
+        c->inference_context->Run(c->op_data->shape_inference_fn));
+
+    Status status = Status::OK();
+    if (is_fed) {
+      // It is possible to feed node output ports with tensors of any shape: as
+      // a result, the shape of a fed port is completely unknown.
+      for (const int output_port : it->second) {
+        status.Update(SetUnknownShape(node, output_port));
+      }
+    }
+    return status;
   }
 
   NodeContext* GetNodeContext(const Node* node) {
@@ -797,6 +819,7 @@ class SymbolicShapeRefiner {
   std::unordered_map<ShapeId, ShapeHandle, HashShapeId> unknown_shapes_;
   std::unordered_map<DimId, DimensionHandle, HashDimId> unknown_dims_;
   FunctionLibraryDefinition function_library_;
+  const std::unordered_map<string, std::unordered_set<int>>& fed_ports_;
 };
 
 // Keep track of shapes and dimensions in a graph.
@@ -983,23 +1006,6 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
   return Status::OK();
 }
 
-Status GraphProperties::OverwriteFedPorts(
-    SymbolicShapeRefiner* shape_refiner,
-    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-    const Node* node, bool* new_shapes) const {
-  auto it = fed_ports.find(node->name());
-  Status status;
-  if (it != fed_ports.end()) {
-    // It is possible to feed node output ports with tensors of any shape: as a
-    // result, the shape of a fed port is completely unknown.
-    for (const int output_port : it->second) {
-      status.Update(shape_refiner->SetUnknownShape(node, output_port));
-    }
-    *new_shapes = true;
-  }
-  return status;
-}
-
 // Manually propagate the input shape for Enter nodes and update any Merge node
 // outputs.
 Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
@@ -1032,7 +1038,6 @@ Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
 
 Status GraphProperties::UpdateShapes(
     SymbolicShapeRefiner* shape_refiner, bool relax,
-    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
     const Node* n, bool* new_shapes) const {
   if (n->IsEnter()) {
     // The Enter shape function always forwards an UnknownShape, so do the right
@@ -1053,9 +1058,7 @@ Status GraphProperties::UpdateShapes(
       }
     }
   }
-  // Nodes can be fed with any shape. The TensorFlow shape inference code can't
-  // handle this properly, so overwrite its behavior here.
-  return OverwriteFedPorts(shape_refiner, fed_ports, n, new_shapes);
+  return Status::OK();
 }
 
 // Propagates the shapes in the transitive fan-out of <new_shapes>.
@@ -1063,7 +1066,6 @@ Status GraphProperties::PropagateShapes(
     SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
     const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
         resources,
-    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
     int num_loops) const {
   // Limit the number of iterations to prevent infinite loops in the presence of
   // incorrect shape functions. The algoritm should converge in at most
@@ -1087,8 +1089,7 @@ Status GraphProperties::PropagateShapes(
            num_loop_iterations++ < max_loop_iterations) {
       const Node* n = new_shapes->pop();
       bool updated = false;
-      TF_RETURN_IF_ERROR(
-          UpdateShapes(shape_refiner, relax, fed_ports, n, &updated));
+      TF_RETURN_IF_ERROR(UpdateShapes(shape_refiner, relax, n, &updated));
       if (updated) {
         for (const Edge* e : n->out_edges()) {
           if (!e->IsControlEdge()) {
@@ -1243,7 +1244,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
   }
 
-  SymbolicShapeRefiner refiner(item_.graph);
+  SymbolicShapeRefiner refiner(item_.graph, fed_ports);
 
   // We propagate shapes through the graph in two phases. In the first phase, we
   // exclusively merge shapes but we do not propagate shapes through the
@@ -1267,8 +1268,8 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
       new_shapes.push(node);
     }
     // Propagate shapes normally.
-    TF_RETURN_IF_ERROR(PropagateShapes(&refiner, relax, &new_shapes, resources,
-                                       fed_ports, num_loops));
+    TF_RETURN_IF_ERROR(
+        PropagateShapes(&refiner, relax, &new_shapes, resources, num_loops));
   }
 
   // Track shapes globally across the graph.
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 30351f58fd..4c3f3f5f53 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -102,16 +102,10 @@ class GraphProperties {
   // Process the Enter node, and enqueue its fanout in new_shapes if needed.
   static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner,
                             const Node* node, bool relax, bool* new_shapes);
-  // Process a node that is used to feed the model.
-  Status OverwriteFedPorts(
-      SymbolicShapeRefiner* shape_refiner,
-      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-      const Node* node, bool* new_shapes) const;
   // Update the shapes for node 'n'. If output shapes for n have changed,
   // enqueue its fanout in 'new_shapes'.
   Status UpdateShapes(
       SymbolicShapeRefiner* shape_refiner, bool relax,
-      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
       const Node* n, bool* new_shapes) const;
   // Propagate the shapes for the nodes enqueued in new_shapes and their
   // transitive fanout until a fixed point is reached.
@@ -119,7 +113,6 @@ class GraphProperties {
       SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
       const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
           resources,
-      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
       int num_loops) const;
 
   // Data members
-- 
GitLab


From 0ea0049fa500078c132ed29b60beb8831de26dbb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 11:57:48 -0700
Subject: [PATCH 160/450] Internal cleanup.

PiperOrigin-RevId: 193552240
---
 .../java/org/tensorflow/lite/DataType.java    | 12 ++-
 .../java/org/tensorflow/lite/Interpreter.java | 19 +++--
 .../lite/NativeInterpreterWrapper.java        | 21 +++---
 .../main/java/org/tensorflow/lite/Tensor.java |  7 +-
 .../java/src/main/native/exception_jni.cc     |  3 +-
 .../native/nativeinterpreterwrapper_jni.cc    | 74 +++++++++++--------
 .../lite/java/src/main/native/tensor_jni.cc   | 35 +++++----
 .../lite/NativeInterpreterWrapperTest.java    |  6 +-
 8 files changed, 102 insertions(+), 75 deletions(-)

diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
index fc16488a64..75334cd96e 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
@@ -51,7 +51,11 @@ enum DataType {
       }
     }
     throw new IllegalArgumentException(
-        "DataType " + c + " is not recognized in Java (version " + TensorFlowLite.version() + ")");
+        "DataType error: DataType "
+            + c
+            + " is not recognized in Java (version "
+            + TensorFlowLite.version()
+            + ")");
   }
 
   /** Returns byte size of the type. */
@@ -68,7 +72,8 @@ enum DataType {
       case BYTEBUFFER:
         return 1;
     }
-    throw new IllegalArgumentException("DataType " + this + " is not supported yet");
+    throw new IllegalArgumentException(
+        "DataType error: DataType " + this + " is not supported yet");
   }
 
   /** Gets string names of the data type. */
@@ -85,7 +90,8 @@ enum DataType {
       case BYTEBUFFER:
         return "ByteBuffer";
     }
-    throw new IllegalArgumentException("DataType " + this + " is not supported yet");
+    throw new IllegalArgumentException(
+        "DataType error: DataType " + this + " is not supported yet");
   }
 
   // Cached to avoid copying it
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index a33959dca4..e915e65aa1 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -137,17 +137,19 @@ public final class Interpreter implements AutoCloseable {
   public void runForMultipleInputsOutputs(
       @NonNull Object[] inputs, @NonNull Map<Integer, Object> outputs) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     Tensor[] tensors = wrapper.run(inputs);
     if (outputs == null || tensors == null || outputs.size() > tensors.length) {
-      throw new IllegalArgumentException("Outputs do not match with model outputs.");
+      throw new IllegalArgumentException("Output error: Outputs do not match with model outputs.");
     }
     final int size = tensors.length;
     for (Integer idx : outputs.keySet()) {
       if (idx == null || idx < 0 || idx >= size) {
         throw new IllegalArgumentException(
-            String.format("Invalid index of output %d (should be in range [0, %d))", idx, size));
+            String.format(
+                "Output error: Invalid index of output %d (should be in range [0, %d))",
+                idx, size));
       }
       tensors[idx].copyTo(outputs.get(idx));
     }
@@ -160,7 +162,7 @@ public final class Interpreter implements AutoCloseable {
    */
   public void resizeInput(int idx, @NonNull int[] dims) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     wrapper.resizeInput(idx, dims);
   }
@@ -173,7 +175,7 @@ public final class Interpreter implements AutoCloseable {
    */
   public int getInputIndex(String opName) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     return wrapper.getInputIndex(opName);
   }
@@ -186,7 +188,7 @@ public final class Interpreter implements AutoCloseable {
    */
   public int getOutputIndex(String opName) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     return wrapper.getOutputIndex(opName);
   }
@@ -198,7 +200,7 @@ public final class Interpreter implements AutoCloseable {
    */
   public Long getLastNativeInferenceDurationNanoseconds() {
     if (wrapper == null) {
-      throw new IllegalStateException("The interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The interpreter has already been closed.");
     }
     return wrapper.getLastNativeInferenceDurationNanoseconds();
   }
@@ -208,7 +210,8 @@ public final class Interpreter implements AutoCloseable {
     if (wrapper != null) {
       wrapper.setUseNNAPI(useNNAPI);
     } else {
-      throw new IllegalStateException("NativeInterpreterWrapper has already been closed.");
+      throw new IllegalStateException(
+          "Internal error: NativeInterpreterWrapper has already been closed.");
     }
   }
 
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index fc8187acfe..dfc8ac111a 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -80,7 +80,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
   /** Sets inputs, runs model inference and returns outputs. */
   Tensor[] run(Object[] inputs) {
     if (inputs == null || inputs.length == 0) {
-      throw new IllegalArgumentException("Invalid inputs. Inputs should not be null or empty.");
+      throw new IllegalArgumentException("Input error: Inputs should not be null or empty.");
     }
     int[] dataTypes = new int[inputs.length];
     Object[] sizes = new Object[inputs.length];
@@ -92,7 +92,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
         ByteBuffer buffer = (ByteBuffer) inputs[i];
         if (buffer.order() != ByteOrder.nativeOrder()) {
           throw new IllegalArgumentException(
-              "Invalid ByteBuffer. It shoud use ByteOrder.nativeOrder().");
+              "Input error: ByteBuffer shoud use ByteOrder.nativeOrder().");
         }
         numsOfBytes[i] = buffer.limit();
         sizes[i] = getInputDims(interpreterHandle, i, numsOfBytes[i]);
@@ -103,7 +103,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       } else {
         throw new IllegalArgumentException(
             String.format(
-                "%d-th element of the %d inputs is not an array or a ByteBuffer.",
+                "Input error: %d-th element of the %d inputs is not an array or a ByteBuffer.",
                 i, inputs.length));
       }
     }
@@ -119,7 +119,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
             this,
             isMemoryAllocated);
     if (outputsHandles == null || outputsHandles.length == 0) {
-      throw new IllegalStateException("Interpreter has no outputs.");
+      throw new IllegalStateException("Internal error: Interpreter has no outputs.");
     }
     isMemoryAllocated = true;
     Tensor[] outputs = new Tensor[outputsHandles.length];
@@ -169,7 +169,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     } else {
       throw new IllegalArgumentException(
           String.format(
-              "%s is not a valid name for any input. The indexes of the inputs are %s",
+              "Input error: %s is not a valid name for any input. "
+                  + "The indexes of the inputs are %s",
               name, inputsIndexes.toString()));
     }
   }
@@ -190,7 +191,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     } else {
       throw new IllegalArgumentException(
           String.format(
-              "%s is not a valid name for any output. The indexes of the outputs are %s",
+              "Input error: %s is not a valid name for any output. "
+                  + "The indexes of the outputs are %s",
               name, outputsIndexes.toString()));
     }
   }
@@ -229,7 +231,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
         return DataType.BYTEBUFFER;
       }
     }
-    throw new IllegalArgumentException("cannot resolve DataType of " + o.getClass().getName());
+    throw new IllegalArgumentException(
+        "DataType error: cannot resolve DataType of " + o.getClass().getName());
   }
 
   /** Returns the shape of an object as an int array. */
@@ -245,7 +248,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       return 0;
     }
     if (Array.getLength(o) == 0) {
-      throw new IllegalArgumentException("array lengths cannot be 0.");
+      throw new IllegalArgumentException("Array lengths cannot be 0.");
     }
     return 1 + numDimensions(Array.get(o, 0));
   }
@@ -259,7 +262,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       shape[dim] = len;
     } else if (shape[dim] != len) {
       throw new IllegalArgumentException(
-          String.format("mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim));
+          String.format("Mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim));
     }
     for (int i = 0; i < len; ++i) {
       fillShape(Array.get(o, i), dim + 1, shape);
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
index 54ace6c63c..09e887aae3 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -34,15 +34,16 @@ final class Tensor {
     if (NativeInterpreterWrapper.dataTypeOf(dst) != dtype) {
       throw new IllegalArgumentException(
           String.format(
-              "Cannot convert an TensorFlowLite tensor with type %s to a Java object of "
-                  + "type %s (which is compatible with the TensorFlowLite type %s)",
+              "Output error: Cannot convert an TensorFlowLite tensor with type %s to a Java "
+                  + "object of type %s (which is compatible with the TensorFlowLite type %s)",
               dtype, dst.getClass().getName(), NativeInterpreterWrapper.dataTypeOf(dst)));
     }
     int[] dstShape = NativeInterpreterWrapper.shapeOf(dst);
     if (!Arrays.equals(dstShape, shapeCopy)) {
       throw new IllegalArgumentException(
           String.format(
-              "Shape of output target %s does not match with the shape of the Tensor %s.",
+              "Output error: Shape of output target %s does not match with the shape of the "
+                  + "Tensor %s.",
               Arrays.toString(dstShape), Arrays.toString(shapeCopy)));
     }
     readMultiDimensionalArray(nativeHandle, dst);
diff --git a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
index 1578c9e3dd..34d91be04c 100644
--- a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
@@ -44,7 +44,8 @@ BufferErrorReporter::BufferErrorReporter(JNIEnv* env, int limit) {
   buffer_ = new char[limit];
   if (!buffer_) {
     throwException(env, kNullPointerException,
-                   "Malloc of BufferErrorReporter to hold %d char failed.",
+                   "Internal error: Malloc of BufferErrorReporter to hold %d "
+                   "char failed.",
                    limit);
     return;
   }
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index 844226203b..ccfdfd829b 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -22,7 +22,7 @@ const int kBufferSize = 256;
 tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) {
   if (handle == 0) {
     throwException(env, kIllegalArgumentException,
-                   "Invalid handle to Interpreter.");
+                   "Internal error: Invalid handle to Interpreter.");
     return nullptr;
   }
   return reinterpret_cast<tflite::Interpreter*>(handle);
@@ -30,7 +30,8 @@ tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) {
 
 tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) {
   if (handle == 0) {
-    throwException(env, kIllegalArgumentException, "Invalid handle to model.");
+    throwException(env, kIllegalArgumentException,
+                   "Internal error: Invalid handle to model.");
     return nullptr;
   }
   return reinterpret_cast<tflite::FlatBufferModel*>(handle);
@@ -39,7 +40,7 @@ tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) {
 BufferErrorReporter* convertLongToErrorReporter(JNIEnv* env, jlong handle) {
   if (handle == 0) {
     throwException(env, kIllegalArgumentException,
-                   "Invalid handle to ErrorReporter.");
+                   "Internal error: Invalid handle to ErrorReporter.");
     return nullptr;
   }
   return reinterpret_cast<BufferErrorReporter*>(handle);
@@ -51,7 +52,7 @@ std::vector<int> convertJIntArrayToVector(JNIEnv* env, jintArray inputs) {
   jint* ptr = env->GetIntArrayElements(inputs, nullptr);
   if (ptr == nullptr) {
     throwException(env, kIllegalArgumentException,
-                   "Empty dimensions of input array.");
+                   "Array has empty dimensions.");
     return {};
   }
   for (int i = 0; i < size; ++i) {
@@ -113,7 +114,7 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
                          jobjectArray sizes) {
   if (input_size != interpreter->inputs().size()) {
     throwException(env, kIllegalArgumentException,
-                   "Expected num of inputs is %d but got %d",
+                   "Input error: Expected num of inputs is %d but got %d",
                    interpreter->inputs().size(), input_size);
     return kTfLiteError;
   }
@@ -121,8 +122,9 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
       input_size != env->GetArrayLength(nums_of_bytes) ||
       input_size != env->GetArrayLength(values)) {
     throwException(env, kIllegalArgumentException,
-                   "Arrays in arguments should be of the same length, but got "
-                   "%d sizes, %d data_types, %d nums_of_bytes, and %d values",
+                   "Internal error: Arrays in arguments should be of the same "
+                   "length, but got %d sizes, %d data_types, %d nums_of_bytes, "
+                   "and %d values",
                    input_size, env->GetArrayLength(data_types),
                    env->GetArrayLength(nums_of_bytes),
                    env->GetArrayLength(values));
@@ -136,8 +138,8 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
     int num_dims = static_cast<int>(env->GetArrayLength(dims));
     if (target->dims->size != num_dims) {
       throwException(env, kIllegalArgumentException,
-                     "%d-th input should have %d dimensions, but found %d "
-                     "dimensions",
+                     "Input error: %d-th input should have %d dimensions, but "
+                     "found %d dimensions",
                      i, target->dims->size, num_dims);
       return kTfLiteError;
     }
@@ -150,7 +152,8 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
                   num_dims);
         printDims(obtained_dims.get(), kBufferSize, ptr, num_dims);
         throwException(env, kIllegalArgumentException,
-                       "%d-th input dimension should be [%s], but found [%s]",
+                       "Input error: %d-th input dimension should be [%s], but "
+                       "found [%s]",
                        i, expected_dims.get(), obtained_dims.get());
         env->ReleaseIntArrayElements(dims, ptr, JNI_ABORT);
         return kTfLiteError;
@@ -236,8 +239,8 @@ TfLiteStatus setInputs(JNIEnv* env, tflite::Interpreter* interpreter,
       TfLiteType type = resolveDataType(data_type[i]);
       if (type != target->type) {
         throwException(env, kIllegalArgumentException,
-                       "DataType (%d) of input data does not match with the "
-                       "DataType (%d) of model inputs.",
+                       "Input error: DataType (%d) of input data does not "
+                       "match with the DataType (%d) of model inputs.",
                        type, target->type);
         return kTfLiteError;
       }
@@ -270,7 +273,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputNames(JNIEnv* env,
   jclass string_class = env->FindClass("java/lang/String");
   if (string_class == nullptr) {
     throwException(env, kUnsupportedOperationException,
-                   "Can not find java/lang/String class to get input names.");
+                   "Internal error: Can not find java/lang/String class to get "
+                   "input names.");
     return nullptr;
   }
   size_t size = interpreter->inputs().size();
@@ -292,7 +296,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env,
   jclass string_class = env->FindClass("java/lang/String");
   if (string_class == nullptr) {
     throwException(env, kUnsupportedOperationException,
-                   "Can not find java/lang/String class to get output names.");
+                   "Internal error: Can not find java/lang/String class to get "
+                   "output names.");
     return nullptr;
   }
   size_t size = interpreter->outputs().size();
@@ -351,8 +356,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel(
       path, verifier.get(), error_reporter);
   if (!model) {
     throwException(env, kIllegalArgumentException,
-                   "Contents of %s does not encode a valid TensorFlowLite "
-                   "model: %s",
+                   "Contents of %s does not encode a valid "
+                   "TensorFlowLite model: %s",
                    path, error_reporter->CachedErrorMessage());
     env->ReleaseStringUTFChars(model_file, path);
     return 0;
@@ -380,8 +385,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
       buf, static_cast<size_t>(capacity), error_reporter);
   if (!model) {
     throwException(env, kIllegalArgumentException,
-                   "MappedByteBuffer does not encode a valid TensorFlowLite "
-                   "model: %s",
+                   "MappedByteBuffer does not encode a valid "
+                   "TensorFlowLite model: %s",
                    error_reporter->CachedErrorMessage());
     return 0;
   }
@@ -403,7 +408,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
       &interpreter, static_cast<int>(num_threads));
   if (status != kTfLiteOk) {
     throwException(env, kIllegalArgumentException,
-                   "Cannot create interpreter: %s",
+                   "Internal error: Cannot create interpreter: %s",
                    error_reporter->CachedErrorMessage());
     return 0;
   }
@@ -411,7 +416,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
   status = interpreter->AllocateTensors();
   if (status != kTfLiteOk) {
     throwException(env, kNullPointerException,
-                   "Can not allocate memory for the interpreter",
+                   "Internal error: Cannot allocate memory for the interpreter",
                    error_reporter->CachedErrorMessage());
     return 0;
   }
@@ -440,7 +445,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
     // resizes inputs
     status = resizeInputs(env, interpreter, input_size, sizes);
     if (status != kTfLiteOk) {
-      throwException(env, kNullPointerException, "Can not resize the input: %s",
+      throwException(env, kNullPointerException,
+                     "Internal error: Can not resize the input: %s",
                      error_reporter->CachedErrorMessage());
       return nullptr;
     }
@@ -448,7 +454,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
     status = interpreter->AllocateTensors();
     if (status != kTfLiteOk) {
       throwException(env, kNullPointerException,
-                     "Can not allocate memory for the given inputs: %s",
+                     "Internal error: Can not allocate memory for the given "
+                     "inputs: %s",
                      error_reporter->CachedErrorMessage());
       return nullptr;
     }
@@ -461,7 +468,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
   // runs inference
   if (interpreter->Invoke() != kTfLiteOk) {
     throwException(env, kIllegalArgumentException,
-                   "Failed to run on the given Interpreter: %s",
+                   "Internal error: Failed to run on the given Interpreter: %s",
                    error_reporter->CachedErrorMessage());
     return nullptr;
   }
@@ -479,8 +486,9 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
   // returns outputs
   const std::vector<int>& results = interpreter->outputs();
   if (results.empty()) {
-    throwException(env, kIllegalArgumentException,
-                   "The Interpreter does not have any outputs.");
+    throwException(
+        env, kIllegalArgumentException,
+        "Internal error: The Interpreter does not have any outputs.");
     return nullptr;
   }
   jlongArray outputs = env->NewLongArray(results.size());
@@ -501,7 +509,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims(
   const int idx = static_cast<int>(input_idx);
   if (input_idx < 0 || input_idx >= interpreter->inputs().size()) {
     throwException(env, kIllegalArgumentException,
-                   "Out of range: Failed to get %d-th input out of %d inputs",
+                   "Input error: Out of range: Failed to get %d-th input out of"
+                   " %d inputs",
                    input_idx, interpreter->inputs().size());
     return nullptr;
   }
@@ -514,8 +523,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims(
     }
     if (num_bytes != expected_num_bytes) {
       throwException(env, kIllegalArgumentException,
-                     "Failed to get input dimensions. %d-th input should have"
-                     " %d bytes, but found %d bytes.",
+                     "Input error: Failed to get input dimensions. %d-th input "
+                     "should have %d bytes, but found %d bytes.",
                      idx, expected_num_bytes, num_bytes);
       return nullptr;
     }
@@ -533,8 +542,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputDataType(
   const int idx = static_cast<int>(output_idx);
   if (output_idx < 0 || output_idx >= interpreter->outputs().size()) {
     throwException(env, kIllegalArgumentException,
-                   "Out of range: Failed to get %d-th output out of %d outputs",
-                   output_idx, interpreter->outputs().size());
+                   "Failed to get %d-th output out of %d outputs", output_idx,
+                   interpreter->outputs().size());
     return -1;
   }
   TfLiteTensor* target = interpreter->tensor(interpreter->outputs()[idx]);
@@ -555,7 +564,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput(
   const int idx = static_cast<int>(input_idx);
   if (idx < 0 || idx >= interpreter->inputs().size()) {
     throwException(env, kIllegalArgumentException,
-                   "Can not resize %d-th input for a model having %d inputs.",
+                   "Input error: Can not resize %d-th input for a model having "
+                   "%d inputs.",
                    idx, interpreter->inputs().size());
     return JNI_FALSE;
   }
@@ -567,7 +577,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput(
         interpreter->inputs()[idx], convertJIntArrayToVector(env, dims));
     if (status != kTfLiteOk) {
       throwException(env, kIllegalArgumentException,
-                     "Failed to resize %d-th input: %s", idx,
+                     "Internal error: Failed to resize %d-th input: %s", idx,
                      error_reporter->CachedErrorMessage());
       return JNI_FALSE;
     }
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
index 65126e78a3..17f4be09c6 100644
--- a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
@@ -23,7 +23,7 @@ namespace {
 TfLiteTensor* convertLongToTensor(JNIEnv* env, jlong handle) {
   if (handle == 0) {
     throwException(env, kIllegalArgumentException,
-                   "Invalid handle to TfLiteTensor.");
+                   "Internal error: Invalid handle to TfLiteTensor.");
     return nullptr;
   }
   return reinterpret_cast<TfLiteTensor*>(handle);
@@ -36,7 +36,8 @@ size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
   size_t to_copy = num_elements * elementByteSize(type);
   if (to_copy > dst_size) {
     throwException(env, kIllegalStateException,
-                   "cannot write Java array of %d bytes to Tensor of %d bytes",
+                   "Internal error: cannot write Java array of %d bytes to "
+                   "Tensor of %d bytes",
                    to_copy, dst_size);
     return 0;
   }
@@ -71,10 +72,10 @@ size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
     }
     default: {
       throwException(env, kUnsupportedOperationException,
-                     "TensorFlowLite currently supports float (32 bits), "
-                     "int (32 bits), byte (8 bits), and long (64 bits), "
-                     "support for other types (DataType %d in this case) will "
-                     "be added in the future",
+                     "DataType error: TensorFlowLite currently supports float "
+                     "(32 bits), int (32 bits), byte (8 bits), and long "
+                     "(64 bits), support for other types (DataType %d in this "
+                     "case) will be added in the future",
                      kTfLiteFloat32, type);
       return 0;
     }
@@ -88,8 +89,9 @@ size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
   if (size > src_size) {
     throwException(
         env, kIllegalStateException,
-        "cannot fill a Java array of %d bytes with a Tensor of %d bytes", size,
-        src_size);
+        "Internal error: cannot fill a Java array of %d bytes with a Tensor of "
+        "%d bytes",
+        size, src_size);
     return 0;
   }
   switch (data_type) {
@@ -117,8 +119,8 @@ size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
       return size;
     }
     default: {
-      throwException(env, kIllegalStateException, "invalid DataType(%d)",
-                     data_type);
+      throwException(env, kIllegalStateException,
+                     "DataType error: invalid DataType(%d)", data_type);
     }
   }
   return 0;
@@ -152,19 +154,22 @@ size_t elementByteSize(TfLiteType data_type) {
   switch (data_type) {
     case kTfLiteFloat32:
       static_assert(sizeof(jfloat) == 4,
-                    "Java float not compatible with kTfLiteFloat");
+                    "Interal error: Java float not compatible with "
+                    "kTfLiteFloat");
       return 4;
     case kTfLiteInt32:
       static_assert(sizeof(jint) == 4,
-                    "Java int not compatible with kTfLiteInt");
+                    "Interal error: Java int not compatible with kTfLiteInt");
       return 4;
     case kTfLiteUInt8:
       static_assert(sizeof(jbyte) == 1,
-                    "Java byte not compatible with kTfLiteUInt8");
+                    "Interal error: Java byte not compatible with "
+                    "kTfLiteUInt8");
       return 1;
     case kTfLiteInt64:
       static_assert(sizeof(jlong) == 8,
-                    "Java long not compatible with kTfLiteInt64");
+                    "Interal error: Java long not compatible with "
+                    "kTfLiteInt64");
       return 8;
     default:
       return 0;
@@ -212,7 +217,7 @@ Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env,
   int num_dims = tensor->dims->size;
   if (num_dims == 0) {
     throwException(env, kIllegalArgumentException,
-                   "copyTo() is not meant for scalar Tensors.");
+                   "Internal error: Cannot copy empty/scalar Tensors.");
     return;
   }
   readMultiDimensionalArray(env, tensor->type, tensor->data.raw, tensor->bytes,
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
index dbe45e5a05..7c00d3196f 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
@@ -321,9 +321,7 @@ public final class NativeInterpreterWrapperTest {
       wrapper.run(inputs);
       fail();
     } catch (IllegalArgumentException e) {
-      assertThat(e)
-          .hasMessageThat()
-          .contains("Invalid inputs. Inputs should not be null or empty.");
+      assertThat(e).hasMessageThat().contains("Inputs should not be null or empty.");
     }
     wrapper.close();
   }
@@ -440,7 +438,7 @@ public final class NativeInterpreterWrapperTest {
       NativeInterpreterWrapper.numDimensions(emptyArray);
       fail();
     } catch (IllegalArgumentException e) {
-      assertThat(e).hasMessageThat().contains("array lengths cannot be 0.");
+      assertThat(e).hasMessageThat().contains("Array lengths cannot be 0.");
     }
   }
 
-- 
GitLab


From 16d25e8c8a9ebb6500d3b3418ca8c2bb80c3e42e Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Thu, 19 Apr 2018 11:58:04 -0700
Subject: [PATCH 161/450] Add support for Dataset Iterators in Model
 training/eval methods in graph mode.

PiperOrigin-RevId: 193552275
---
 tensorflow/python/keras/BUILD                 |   1 +
 .../keras/_impl/keras/engine/training.py      | 195 ++++++++++++------
 .../_impl/keras/engine/training_arrays.py     |  12 +-
 .../keras/_impl/keras/engine/training_test.py |  84 +++++++-
 .../api/golden/tensorflow.keras.-model.pbtxt  |   4 +-
 .../golden/tensorflow.keras.-sequential.pbtxt |   4 +-
 .../tensorflow.keras.models.-model.pbtxt      |   4 +-
 .../tensorflow.keras.models.-sequential.pbtxt |   4 +-
 8 files changed, 223 insertions(+), 85 deletions(-)

diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index ca7686b1d1..70040b7e74 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -175,6 +175,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":backend",
+        "//tensorflow/python/data",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 012d9ceea4..146e8fdac9 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -634,12 +636,20 @@ class Model(Network):
     This is a purely internal method, subject to refactoring at any time.
 
     Args:
-      x: An array or list of arrays, to be used as input data. If the model
-       has known, named inputs, this could also be a dict mapping input names
-       to the corresponding array.
-      y: An array or list of arrays, to be used as target data. If the model
-       has known, named outputs, this could also be a dict mapping output names
-       to the corresponding array.
+      x: Input data. It could be:
+        - A Numpy array (or array-like), or a list of arrays
+          (in case the model has multiple inputs).
+        - A TensorFlow tensor, or a list of tensors
+          (in case the model has multiple inputs).
+        - A dict mapping input names to the corresponding array/tensors,
+          if the model has named inputs.
+        - A `tf.data` dataset iterator.
+      y: Target data. Like the input data `x`,
+        it could be either Numpy array(s) or TensorFlow tensor(s).
+        It should be consistent with `x` (you cannot have Numpy inputs and
+        tensor targets, or inversely). If `x` is a dataset iterator,
+        `y` should not be specified
+        (since targets will be obtained from the iterator).
       sample_weight: An optional sample-weight array passed by the user to
         weight the importance of each sample in `x`.
       class_weight: An optional class-weight array by the user to
@@ -659,6 +669,31 @@ class Model(Network):
       RuntimeError: If the model was never compiled.
     """
     # First, we build/compile the model on the fly if necessary.
+    if isinstance(x, dataset_ops.Dataset):
+      raise ValueError('You passed a `Dataset` instance to your model (%s), '
+                       'which is not supported. Instead, pass an `Iterator`, '
+                       'which you can obtain e.g. via '
+                       '`dataset.make_one_shot_iterator()` (the exact method '
+                       'to use will depend on your specific dataset).' % x)
+    if isinstance(x, iterator_ops.Iterator):
+      if y is not None:
+        raise ValueError('You passed a dataset iterator (%s) as input `x` to '
+                         'your model. In that case, you should not specify '
+                         'a target (`y`) argument, since the dataset iterator '
+                         'generates both input data and target data. '
+                         'Received: %s' % (x, y))
+      if not context.executing_eagerly():
+        x, y = x.get_next()
+        # TODO(fchollet): handle case of `get_next` not returning 2 tensors?
+      else:
+        # TODO(psv): implement this. The way to support it will be to typecheck
+        # for `iterator` before `_standardize_user_data` is called and redirect
+        # to new training/eval functions in `training_eager.py`. The model
+        # may need to get built using the specs of the data from the first batch
+        # drawn from the iterator.
+        raise ValueError('Dataset iterators are not supported '
+                         'with eager execution yet.')
+
     all_inputs = []
     if not self.built:
       # We need to use `x` to set the model inputs.
@@ -1016,22 +1051,26 @@ class Model(Network):
     """Trains the model for a fixed number of epochs (iterations on a dataset).
 
     Arguments:
-        x: Numpy array of training data (if the model has a single input),
-            or list of Numpy arrays (if the model has multiple inputs).
-            If input layers in the model are named, you can also pass a
-            dictionary mapping input names to Numpy arrays.
-            `x` can be `None` (default) if feeding from
-            TensorFlow data tensors.
-        y: Numpy array of target (label) data
-            (if the model has a single output),
-            or list of Numpy arrays (if the model has multiple outputs).
-            If output layers in the model are named, you can also pass a
-            dictionary mapping output names to Numpy arrays.
-            `y` can be `None` (default) if feeding from
-            TensorFlow data tensors.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         batch_size: Integer or `None`.
             Number of samples per gradient update.
             If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` if your data is in the
+            form of symbolic tensors or dataset iterators (since they generate
+            batches).
         epochs: Integer. Number of epochs to train the model.
             An epoch is an iteration over the entire `x` and `y`
             data provided.
@@ -1053,11 +1092,14 @@ class Model(Network):
             on this data at the end of each epoch.
             The validation data is selected from the last samples
             in the `x` and `y` data provided, before shuffling.
-        validation_data: tuple `(x_val, y_val)` or tuple
-            `(x_val, y_val, val_sample_weights)` on which to evaluate
+        validation_data: Data on which to evaluate
             the loss and any model metrics at the end of each epoch.
             The model will not be trained on this data.
             `validation_data` will override `validation_split`.
+            `validation_data` could be:
+              - tuple `(x_val, y_val)` of Numpy arrays or tensors
+              - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays
+              - dataset iterator
         shuffle: Boolean (whether to shuffle the training data
             before each epoch) or str (for 'batch').
             'batch' is a special option for dealing with the
@@ -1134,17 +1176,22 @@ class Model(Network):
         batch_size=batch_size)
     # Prepare validation data.
     if validation_data:
-      if len(validation_data) == 2:
+      if isinstance(validation_data, iterator_ops.Iterator):
+        val_x = validation_data
+        val_y = None
+        val_sample_weight = None
+      elif len(validation_data) == 2:
         val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
         val_sample_weight = None
       elif len(validation_data) == 3:
         val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
       else:
         raise ValueError(
-            'When passing validation_data, '
-            'it must contain 2 (x_val, y_val) '
-            'or 3 (x_val, y_val, val_sample_weights) '
-            'items, however it contains %d items' % len(validation_data))
+            'When passing a `validation_data` argument, '
+            'it must contain either 2 items (x_val, y_val), '
+            'or 3 items (x_val, y_val, val_sample_weights), '
+            'or alternatively it could be a dataset iterator. However we '
+            'received `validation_data=%s`' % (validation_data,))
 
       val_x, val_y, val_sample_weights = self._standardize_user_data(
           val_x,
@@ -1218,22 +1265,26 @@ class Model(Network):
     Computation is done in batches.
 
     Arguments:
-        x: Numpy array of test data (if the model has a single input),
-            or list of Numpy arrays (if the model has multiple inputs).
-            If input layers in the model are named, you can also pass a
-            dictionary mapping input names to Numpy arrays.
-            `x` can be `None` (default) if feeding from
-            TensorFlow data tensors.
-        y: Numpy array of target (label) data
-            (if the model has a single output),
-            or list of Numpy arrays (if the model has multiple outputs).
-            If output layers in the model are named, you can also pass a
-            dictionary mapping output names to Numpy arrays.
-            `y` can be `None` (default) if feeding from
-            TensorFlow data tensors.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         batch_size: Integer or `None`.
-            Number of samples per evaluation step.
+            Number of samples per batch of computation.
             If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` if your data is in the
+            form of symbolic tensors or dataset iterators (since they generate
+            batches).
         verbose: 0 or 1. Verbosity mode.
             0 = silent, 1 = progress bar.
         sample_weight: Optional Numpy array of weights for
@@ -1291,9 +1342,13 @@ class Model(Network):
     Computation is done in batches.
 
     Arguments:
-        x: The input data, as a Numpy array
-            (or list of Numpy arrays if the model has multiple outputs).
-        batch_size: Integer. If unspecified, it will default to 32.
+        x: Input samples, as Numpy array(s) or tensor(s).
+        batch_size: Integer or `None`.
+            Number of samples per batch of computation.
+            If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` if your data is in the
+            form of symbolic tensors or dataset iterators (since they generate
+            batches).
         verbose: Verbosity mode, 0 or 1.
         steps: Total number of steps (batches of samples)
             before declaring the prediction round finished.
@@ -1324,20 +1379,24 @@ class Model(Network):
       return training_arrays.predict_loop(
           self, x, batch_size=batch_size, verbose=verbose, steps=steps)
 
-  def train_on_batch(self, x, y, sample_weight=None, class_weight=None):
+  def train_on_batch(self, x, y=None, sample_weight=None, class_weight=None):
     """Runs a single gradient update on a single batch of data.
 
     Arguments:
-        x: Numpy array of training data,
-            or list of Numpy arrays if the model has multiple inputs.
-            If all inputs in the model are named,
-            you can also pass a dictionary
-            mapping input names to Numpy arrays.
-        y: Numpy array of target data,
-            or list of Numpy arrays if the model has multiple outputs.
-            If all outputs in the model are named,
-            you can also pass a dictionary
-            mapping output names to Numpy arrays.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         sample_weight: Optional array of the same length as x, containing
             weights to apply to the model's loss for each sample.
             In the case of temporal data, you can pass a 2D array
@@ -1384,20 +1443,24 @@ class Model(Network):
       return outputs[0]
     return outputs
 
-  def test_on_batch(self, x, y, sample_weight=None):
+  def test_on_batch(self, x, y=None, sample_weight=None):
     """Test the model on a single batch of samples.
 
     Arguments:
-        x: Numpy array of test data,
-            or list of Numpy arrays if the model has multiple inputs.
-            If all inputs in the model are named,
-            you can also pass a dictionary
-            mapping input names to Numpy arrays.
-        y: Numpy array of target data,
-            or list of Numpy arrays if the model has multiple outputs.
-            If all outputs in the model are named,
-            you can also pass a dictionary
-            mapping output names to Numpy arrays.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         sample_weight: Optional array of the same length as x, containing
             weights to apply to the model's loss for each sample.
             In the case of temporal data, you can pass a 2D array
@@ -1437,7 +1500,7 @@ class Model(Network):
     """Returns predictions for a single batch of samples.
 
     Arguments:
-        x: Input samples, as a Numpy array.
+        x: Input samples, as Numpy array(s) or tensor(s).
 
     Returns:
         Numpy array(s) of predictions.
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_arrays.py b/tensorflow/python/keras/_impl/keras/engine/training_arrays.py
index 18116e3a14..4164cae864 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_arrays.py
@@ -23,6 +23,7 @@ import copy
 
 import numpy as np
 
+from tensorflow.python.framework import errors
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import callbacks as cbks
 from tensorflow.python.keras._impl.keras.engine import training_utils
@@ -30,6 +31,7 @@ from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
 from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches
 from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.platform import tf_logging as logging
 
 try:
   from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
@@ -190,7 +192,15 @@ def fit_loop(model,
         batch_logs['batch'] = step_index
         batch_logs['size'] = 1
         callbacks.on_batch_begin(step_index, batch_logs)
-        outs = f(ins)
+        try:
+          outs = f(ins)
+        except errors.OutOfRangeError:
+          logging.warning('Your dataset iterator ran out of data; '
+                          'interrupting training. Make sure that your dataset '
+                          'can generate at least `steps_per_epoch * epochs` '
+                          'batches (in this case, %d batches).' %
+                          (steps_per_epoch * epochs))
+          break
 
         if not isinstance(outs, list):
           outs = [outs]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index d9281436de..58011a1412 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -23,6 +23,7 @@ import unittest
 
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
@@ -31,9 +32,9 @@ from tensorflow.python.keras._impl.keras.engine.training_utils import weighted_m
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
-
 try:
   import scipy.sparse as scipy_sparse  # pylint: disable=g-import-not-at-top
 except ImportError:
@@ -1711,14 +1712,77 @@ class TestTrainingWithDataTensors(test.TestCase):
                               'dropout_acc']
     self.assertEqual(reference_metric_names, model.metrics_names)
 
-if __name__ == '__main__':
-  # Bazel sets these environment variables to very long paths.
-  # Tempfile uses them to create long paths, and in turn multiprocessing
-  # library tries to create sockets named after paths. Delete whatever bazel
-  # writes to these to avoid tests failing due to socket addresses being too
-  # long.
-  for var in ('TMPDIR', 'TMP', 'TEMP'):
-    if var in os.environ:
-      del os.environ[var]
 
+class TestTrainingWithDatasetIterators(test.TestCase):
+
+  def test_training_and_eval_methods_on_iterators_single_io(self):
+    with self.test_session():
+      x = keras.layers.Input(shape=(3,), name='input')
+      y = keras.layers.Dense(4, name='dense')(x)
+      model = keras.Model(x, y)
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+      iterator = dataset.make_one_shot_iterator()
+
+      model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=0)
+      model.evaluate(iterator, steps=2, verbose=0)
+      model.predict(iterator, steps=2)
+      model.train_on_batch(iterator)
+      model.test_on_batch(iterator)
+      # Test with validation data
+      model.fit(iterator,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=iterator, validation_steps=2)
+      # Test with validation split
+      with self.assertRaisesRegexp(ValueError,
+                                   'you cannot use `validation_split`'):
+        model.fit(iterator,
+                  epochs=1, steps_per_epoch=2, verbose=0,
+                  validation_split=0.5, validation_steps=2)
+
+      # Test invalid usage
+      with self.assertRaisesRegexp(ValueError,
+                                   'Instead, pass an `Iterator`'):
+        model.fit(dataset,
+                  epochs=1, steps_per_epoch=2, verbose=0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'you should not specify a target'):
+        model.fit(iterator, iterator,
+                  epochs=1, steps_per_epoch=2, verbose=0)
+
+  def test_iterators_running_out_of_data(self):
+    with self.test_session():
+      x = keras.layers.Input(shape=(3,), name='input')
+      y = keras.layers.Dense(4, name='dense')(x)
+      model = keras.Model(x, y)
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(2)
+      dataset = dataset.batch(10)
+      iterator = dataset.make_one_shot_iterator()
+
+      with test.mock.patch.object(logging, 'warning') as mock_log:
+        model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
+        self.assertRegexpMatches(
+            str(mock_log.call_args),
+            'dataset iterator ran out of data')
+
+
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index 7713d78b8a..cdf2da712f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -251,7 +251,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -263,6 +263,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index 69b81f75fa..5c2c29e60f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -268,7 +268,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -280,6 +280,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index 3ac285681f..b3f3f16922 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -251,7 +251,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -263,6 +263,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index 51ba0c5043..4ac6811bac 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -268,7 +268,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -280,6 +280,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
-- 
GitLab


From a186c4c093fce7e3fcc8cd59ca0e968324311f09 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 12:32:52 -0700
Subject: [PATCH 162/450] Fix bug in ring_reducer.cc abort handling.

PiperOrigin-RevId: 193557334
---
 .../core/common_runtime/ring_reducer.cc       | 20 ++++++++++---------
 .../core/common_runtime/ring_reducer_test.cc  | 12 +++++------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index 79d03a24ce..a1cd762505 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -426,17 +426,20 @@ bool RingReducer::RunAsyncParts() {
     // is done.
     bool dispatched = false;  // true if async action was initiated
     do {
-      if (aborted) break;
+      if (aborted) {
+        // Requeue this RingField to be counted off below.
+        ready_queue.Enqueue(rf);
+        break;
+      }
       switch (rf->action) {
         case RF_INIT:
           if (rf->do_recv) {
             rf->action = RF_RECV;
             auto requeue = [this, rf, &ready_queue, &aborted](Status s) {
-              if (!s.ok()) {
-                aborted = true;
-                StartAbort(s);
-              }
+              const bool bad_status = !s.ok();
+              if (bad_status) aborted = true;
               ready_queue.Enqueue(rf);
+              if (bad_status) StartAbort(s);
             };
             DispatchRecv(rf, requeue);
             dispatched = true;
@@ -481,11 +484,10 @@ bool RingReducer::RunAsyncParts() {
           if (rf->do_send) {
             rf->action = RF_SEND;
             auto send_complete = [this, rf, &ready_queue, &aborted](Status s) {
-              if (!s.ok()) {
-                aborted = true;
-                StartAbort(s);
-              }
+              const bool bad_status = !s.ok();
+              if (bad_status) aborted = true;
               ready_queue.Enqueue(rf);
+              if (bad_status) StartAbort(s);
             };
             DispatchSend(rf, send_complete);
             dispatched = true;
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index 57c36d6582..e4387a074a 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -572,9 +572,9 @@ DEF_TEST(INT32, CPU, 2, 8, 3, 4095, 0)
 DEF_TEST(INT64, CPU, 1, 2, 1, 1001, 0)
 DEF_TEST(INT64, CPU, 2, 8, 3, 4095, 0)
 
-// // Failure tests
-// DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
-// DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
+// Failure tests
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
+DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
 #endif
 
 #ifdef GOOGLE_CUDA
@@ -597,9 +597,9 @@ DEF_TEST(DOUBLE, GPU, 1, 2, 1, 1001, 0)
 // DEF_TEST(INT32, GPU, 1, 2, 1, 1001, 0)
 DEF_TEST(INT64, GPU, 1, 2, 1, 1001, 0)
 
-// // Failure tests
-// DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
-// DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5)
+// Failure tests
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
+DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5)
 #endif
 
 }  // namespace
-- 
GitLab


From 46aec0d27f5d6fb3a0b81bc5a3384da11273dad6 Mon Sep 17 00:00:00 2001
From: Sung Jin Hwang <sjhwang@google.com>
Date: Thu, 19 Apr 2018 12:44:21 -0700
Subject: [PATCH 163/450] Make PmfToQuantizedCdf op to make adjustments if the
 sum of quantized pmf is less than 2**precision.

Prior to the change, the op did nothing when the sum of quantized pmf was less
than 2**precision. While the produced CDF was valid for range coders,
adjustments to CDF could be made to achieve better compression rate.

PiperOrigin-RevId: 193558740
---
 .../contrib/coder/kernels/pmf_to_cdf_op.cc    | 60 ++++++++++++++++---
 .../coder/kernels/pmf_to_cdf_op_test.cc       |  6 +-
 tensorflow/contrib/coder/ops/coder_ops.cc     | 16 +++--
 3 files changed, 64 insertions(+), 18 deletions(-)

diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
index c787e8eded..bd5272ee6f 100644
--- a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
+++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include <algorithm>
+#include <functional>
 #include <iterator>
 #include <numeric>
 #include <vector>
@@ -79,8 +80,8 @@ class PmfToCdfOp : public OpKernel {
   }
 
  private:
-  struct Item {
-    Item(int32* p, double mass) : pointer(p), mass(mass) {
+  struct PenaltyItem {
+    PenaltyItem(int32* p, double mass) : pointer(p), mass(mass) {
       penalty = ComputeNextPenalty();
     }
 
@@ -90,7 +91,7 @@ class PmfToCdfOp : public OpKernel {
       penalty = ComputeNextPenalty();
     }
 
-    friend bool operator<(const Item& lhs, const Item& rhs) {
+    friend bool operator<(const PenaltyItem& lhs, const PenaltyItem& rhs) {
       return lhs.penalty < rhs.penalty;
     }
 
@@ -106,6 +107,34 @@ class PmfToCdfOp : public OpKernel {
     double penalty;
   };
 
+  struct GainItem {
+    GainItem(int32* p, double mass) : pointer(p), mass(mass) {
+      gain = ComputeNextGain();
+    }
+
+    void Increase() {
+      CHECK_GT(*pointer, 0);
+      ++*pointer;
+      gain = ComputeNextGain();
+    }
+
+    friend bool operator>(const GainItem& lhs, const GainItem& rhs) {
+      return lhs.gain > rhs.gain;
+    }
+
+    double ComputeNextGain() {
+      // Never increment zero value to non-zero value.
+      if (*pointer < 1) {
+        return -std::numeric_limits<double>::infinity();
+      }
+      return mass * (std::log2(*pointer + 1) - std::log2(*pointer));
+    }
+
+    int32* pointer;
+    double mass;
+    double gain;
+  };
+
   void PerShard(gtl::ArraySlice<float> pmf,
                 gtl::MutableArraySlice<int32> cdf) const {
     CHECK_EQ(pmf.size(), cdf.size());
@@ -121,7 +150,7 @@ class PmfToCdfOp : public OpKernel {
 
     int32 sum = std::accumulate(cdf.begin(), cdf.end(), 0);
     if (sum > normalizer) {
-      std::vector<Item> queue;
+      std::vector<PenaltyItem> queue;
       queue.reserve(cdf.size());
       for (int i = 0; i < cdf.size(); ++i) {
         queue.emplace_back(&cdf[i], pmf[i]);
@@ -132,9 +161,26 @@ class PmfToCdfOp : public OpKernel {
         queue[0].Decrease();
         // Performs a linear search because this find_if is likely to return
         // iterator very close to the begin.
-        auto iter =
-            std::find_if(std::next(queue.begin()), queue.end(),
-                         [&queue](const Item& rhs) { return queue[0] < rhs; });
+        auto iter = std::find_if(
+            std::next(queue.begin()), queue.end(),
+            [&queue](const PenaltyItem& rhs) { return queue[0] < rhs; });
+        std::rotate(queue.begin(), std::next(queue.begin()), iter);
+      }
+    } else if (sum < normalizer) {
+      std::vector<GainItem> queue;
+      queue.reserve(cdf.size());
+      for (int i = 0; i < cdf.size(); ++i) {
+        queue.emplace_back(&cdf[i], pmf[i]);
+      }
+
+      std::sort(queue.begin(), queue.end(), std::greater<GainItem>());
+      while (sum++ < normalizer) {
+        queue[0].Increase();
+        // Performs a linear search because this find_if is likely to return
+        // iterator very close to the begin.
+        auto iter = std::find_if(
+            std::next(queue.begin()), queue.end(),
+            [&queue](const GainItem& rhs) { return queue[0] > rhs; });
         std::rotate(queue.begin(), std::next(queue.begin()), iter);
       }
     }
diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
index c70e38faab..3408f6b519 100644
--- a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
+++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
@@ -82,7 +82,7 @@ class PmfToQuantizedCdfOpTest : public OpsTestBase {
         EXPECT_GT(diff, 0);
       }
 
-      EXPECT_LE(cdf_slice(cdf_slice.size() - 1), normalizer);
+      EXPECT_EQ(cdf_slice(cdf_slice.size() - 1), normalizer);
     }
   }
 };
@@ -98,6 +98,8 @@ TEST_F(PmfToQuantizedCdfOpTest, UnderSum) {
     GenerateData(&rand, {&matrix(i, 0), n});
   }
 
+  pmf.flat<float>() = pmf.flat<float>() * 0.85f;
+
   constexpr int kPrecision = 10;
   SetupOp(kPrecision, &pmf);
   TF_ASSERT_OK(RunOpKernel());
@@ -115,7 +117,7 @@ TEST_F(PmfToQuantizedCdfOpTest, OverSum) {
   matrix.setZero();
   const std::size_t n = matrix.dimension(1) / 2;
 
-  random::PhiloxRandom gen;
+  random::PhiloxRandom gen(random::New64(), random::New64());
   random::SimplePhilox rand(&gen);
   for (int64 i = 0; i < matrix.dimension(0); ++i) {
     GenerateData(&rand, {&matrix(i, 0), n});
diff --git a/tensorflow/contrib/coder/ops/coder_ops.cc b/tensorflow/contrib/coder/ops/coder_ops.cc
index 9bb171298f..a185e07913 100644
--- a/tensorflow/contrib/coder/ops/coder_ops.cc
+++ b/tensorflow/contrib/coder/ops/coder_ops.cc
@@ -77,7 +77,7 @@ are incorrect. For this reason, the range coder uses integer arithmetics and
 avoids using any floating point operations internally, and `cdf` should contain
 integers representing quantized probability mass rather than floating points. 
 
-data: An int32 tensor.
+data: An int16 tensor.
 cdf: An int32 tensor representing the CDF's of `data`. Each integer is divided
   by `2^precision` to represent a fraction.
 encoded: A range-coded scalar string.
@@ -112,7 +112,7 @@ potential performance issues, the decoder does not return error status.
 encoded: A scalar string tensor from RangeEncode.
 shape: An int32 1-D tensor representing the shape of the data encoded by
   RangeEncode.
-decoded: An int32 tensor with shape equal to `shape`.
+decoded: An int16 tensor with shape equal to `shape`.
 precision: The number of bits for probability quantization. Must be <= 16, and
   must match the precision used by RangeEncode that produced `encoded`.
 )doc");
@@ -138,14 +138,12 @@ platforms. For entropy encoders and decoders to have the same quantized CDF on
 different platforms, the quantized CDF should be produced once and saved, then
 the saved quantized CDF should be used everywhere.
 
-After quantization, if PMF sums to less than or equal to 2^precision, then this
-is equivalent to cumsum over the last dimension. This op makes no effort to make
-the sum close to 2^precision when the sum is already <= 2^precision.
+After quantization, if PMF does not sum to 2^precision, then some values of PMF
+are increased or decreased so that the sum equals 2^precision.
 
-After quantization, if PMF sums to greater than 2^precision, then some values of
-PMF is decreased to keep the sum no more than 2^precision.
-
-Note that the input PMF is pre-quantization.
+Note that the input PMF is pre-quantization. The input PMF is not normalized
+by this op prior to quantization. Therefore the user is responsible for
+normalizing PMF if necessary.
 )doc");
 // clang-format on
 }  // namespace tensorflow
-- 
GitLab


From b3118b1f741896585d47184018f1d74d70e0e6c7 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Thu, 19 Apr 2018 13:08:37 -0700
Subject: [PATCH 164/450] Update adam.py

---
 tensorflow/contrib/optimizer_v2/adam.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 76a867039a..d538ad0fb0 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -40,19 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
     Initialization:
 
-    $$m_0 \Leftarrow 0 (Initialize initial 1st moment vector)$$
-    $$v_0 \Leftarrow 0 (Initialize initial 2nd moment vector)$$
-    $$t \Leftarrow 0 (Initialize timestep)$$
+    $$m_0 := 0 (Initialize initial 1st moment vector)$$
+    $$v_0 := 0 (Initialize initial 2nd moment vector)$$
+    $$t := 0 (Initialize timestep)$$
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    $$t \Leftarrow t + 1$$
-    $$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+    $$t := t + 1$$
+    $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
-    $$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-    $$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+    $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
-- 
GitLab


From 58f6760373b7a2d71053bd17b8017e57e5d1195d Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Thu, 19 Apr 2018 13:09:24 -0700
Subject: [PATCH 165/450] Update api_def_ApplyAdam.pbtxt

---
 tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
index fca8ba2530..b90f5473c8 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
@@ -82,9 +82,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
-$$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
-$$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-$$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+$$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 END
 }
-- 
GitLab


From 3c49ae705fc8dc65c34021bc616218e7bae5d625 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Thu, 19 Apr 2018 13:09:59 -0700
Subject: [PATCH 166/450] Update api_def_ResourceApplyAdam.pbtxt

---
 .../core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
index 8b16d824bf..743247bb60 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -76,8 +76,8 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
-$$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
-$$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-$$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+$$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 }
-- 
GitLab


From 391626d76f6311219d4b78b5515934cbd0dd0c6d Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Thu, 19 Apr 2018 13:11:04 -0700
Subject: [PATCH 167/450] Update adam.py

---
 tensorflow/python/training/adam.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 9f523a3aca..6fa3ff6658 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -43,19 +43,19 @@ class AdamOptimizer(optimizer.Optimizer):
 
     Initialization:
 
-    $$m_0 \Leftarrow 0 (Initialize initial 1st moment vector)$$
-    $$v_0 \Leftarrow 0 (Initialize initial 2nd moment vector)$$
-    $$t \Leftarrow 0 (Initialize timestep)$$
+    $$m_0 := 0 (Initialize initial 1st moment vector)$$
+    $$v_0 := 0 (Initialize initial 2nd moment vector)$$
+    $$t := 0 (Initialize timestep)$$
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    $$t \Leftarrow t + 1$$
-    $$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+    $$t := t + 1$$
+    $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
-    $$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-    $$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+    $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
-- 
GitLab


From b6686d2808b40ed985db2151bcf31961b53e49f5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 13:09:07 -0700
Subject: [PATCH 168/450] Collective Ops Part 4

Add Broadcaster.
A few minor adjustments to CollectiveParams and RMA.

This change is part of a series of changes introducing infrastructure
for collective ops and initial implementations of reduction and broadcast.

PiperOrigin-RevId: 193562391
---
 tensorflow/core/BUILD                         |  30 +
 .../base_collective_executor.cc               |  81 +-
 .../common_runtime/base_collective_executor.h |   7 +
 tensorflow/core/common_runtime/broadcaster.cc | 249 ++++++
 tensorflow/core/common_runtime/broadcaster.h  |  66 ++
 .../core/common_runtime/broadcaster_test.cc   | 741 ++++++++++++++++++
 .../collective_param_resolver_local.cc        |  42 +-
 .../collective_param_resolver_local_test.cc   |   8 +-
 .../common_runtime/collective_rma_local.h     |   2 +
 tensorflow/core/framework/collective.cc       |  15 +-
 tensorflow/core/framework/collective.h        |   7 +-
 11 files changed, 1220 insertions(+), 28 deletions(-)
 create mode 100644 tensorflow/core/common_runtime/broadcaster.cc
 create mode 100644 tensorflow/core/common_runtime/broadcaster.h
 create mode 100644 tensorflow/core/common_runtime/broadcaster_test.cc

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 54e7ab31d7..c15e7de186 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2256,6 +2256,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/allocator_retry.h",
     "common_runtime/base_collective_executor.h",
     "common_runtime/bfc_allocator.h",
+    "common_runtime/broadcaster.h",
     "common_runtime/buf_rendezvous.h",
     "common_runtime/build_graph_options.h",
     "common_runtime/collective_executor_mgr.h",
@@ -2303,6 +2304,7 @@ tf_cuda_library(
         "common_runtime/allocator_retry.cc",
         "common_runtime/base_collective_executor.cc",
         "common_runtime/bfc_allocator.cc",
+        "common_runtime/broadcaster.cc",
         "common_runtime/buf_rendezvous.cc",
         "common_runtime/build_graph_options.cc",
         "common_runtime/collective_executor_mgr.cc",
@@ -3140,6 +3142,34 @@ tf_cc_tests_gpu(
     ],
 )
 
+tf_cc_tests_gpu(
+    name = "broadcaster_test",
+    size = "small",
+    srcs = [
+        "common_runtime/broadcaster_test.cc",
+    ],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":all_kernels",
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":framework",
+        ":framework_internal",
+        ":gpu_runtime",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":protos_test_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+    ],
+)
+
 tf_cc_test_mkl(
     name = "mkl_runtime_tests",
     size = "small",
diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc
index f6332fabdb..637b43c844 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.cc
+++ b/tensorflow/core/common_runtime/base_collective_executor.cc
@@ -14,14 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/base_collective_executor.h"
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/broadcaster.h"
 #include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/ring_reducer.h"
 #include "tensorflow/core/lib/core/notification.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 #define VALUE_IN_DEBUG_STRING false
 
@@ -194,37 +193,68 @@ void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx,
                                           const CollectiveParams& col_params,
                                           const string& exec_key,
                                           StatusCallback done) {
-  const Tensor* input = &ctx->input(0);
+  // On any individual collective Op failure we need to abort the
+  // BufRendezvous so that other Ops in the instance don't hang
+  // waiting for transmissions that will never happen.  Do so after a
+  // delay so that the original error status is more likely to
+  // propagate up, and peers are unlikely to re-create the purged
+  // BufRendezvous by late-arriving requests.
+  StatusCallback done_safe = [this, done](const Status& s) {
+    if (!s.ok()) {
+      Ref();  // Ensure this lasts until the closure executes.
+      SchedNonBlockingClosureAfter(1000000, [this, s] {
+        remote_access_->buf_rendezvous()->StartAbort(s);
+        Unref();
+      });
+    }
+    done(s);
+  };
+
   Tensor* output = ctx->mutable_output(0);
   string error;
   switch (col_params.instance.type) {
     case REDUCTION_COLLECTIVE: {
       // TODO(tucker): support other reduction algorithms,
       // e.g. tree-reduce, hybrid tree/ring, delegate-to-NCCL, etc.
+      const Tensor* input = &ctx->input(0);
       RingReducer* reducer =
           CreateReducer(ctx, CtxParams(ctx), col_params, exec_key, step_id_,
                         input, output, &error);
       if (!reducer) {
-        done(errors::Internal(error));
+        done_safe(errors::Internal(error));
         return;
       }
       // Run in an I/O thread, so as not to starve the executor threads.
       // TODO(tucker): Instead of forking every per-device Collective
       // Op off into its own thread, consider queuing them on a
       // fixed-size thread-pool dedicated to running CollectiveOps.
-      SchedClosure([reducer, done]() {
-        reducer->Run([reducer, done](const Status& s) {
-          done(s);
+      SchedClosure([reducer, done_safe]() {
+        reducer->Run([reducer, done_safe](const Status& s) {
+          done_safe(s);
           delete reducer;
         });
       });
     } break;
-    case BROADCAST_COLLECTIVE:
-      done(errors::Internal("Collective Broadcast unimplemented"));
-      break;
+
+    case BROADCAST_COLLECTIVE: {
+      Broadcaster* broadcaster = CreateBroadcaster(
+          ctx, CtxParams(ctx), col_params, exec_key, step_id_, output, &error);
+      if (!broadcaster) {
+        done_safe(errors::Internal(error));
+        return;
+      }
+      // Run in an I/O thread, so as not to starve the executor threads.
+      SchedClosure([broadcaster, done_safe]() {
+        broadcaster->Run([broadcaster, done_safe](const Status& s) {
+          done_safe(s);
+          delete broadcaster;
+        });
+      });
+    } break;
+
     default:
-      done(errors::Internal("Unimplemented CollectiveType ",
-                            col_params.instance.type));
+      done_safe(errors::Internal("Unimplemented CollectiveType ",
+                                 col_params.instance.type));
   }
 }
 
@@ -254,4 +284,31 @@ RingReducer* BaseCollectiveExecutor::CreateReducer(
   }
 }
 
+Broadcaster* BaseCollectiveExecutor::CreateBroadcaster(
+    OpKernelContext* ctx, OpKernelContext::Params* params,
+    const CollectiveParams& col_params, const string& exec_key, int64 step_id,
+    Tensor* output, string* error) {
+  switch (col_params.instance.data_type) {
+    case DT_INT32:
+      if (col_params.group.device_type == DEVICE_GPU) {
+        *error =
+            "Collective Broadcast does not support datatype DT_INT32 on "
+            "DEVICE_GPU";
+        return nullptr;
+      }
+      TF_FALLTHROUGH_INTENDED;
+    case DT_FLOAT:
+    case DT_DOUBLE:
+    case DT_INT64: {
+      return new Broadcaster(this, dev_mgr_, ctx, params, col_params, exec_key,
+                             step_id, output);
+    } break;
+    default:
+      *error =
+          strings::StrCat("Collective Broadcast does not support datatype ",
+                          DataTypeString(col_params.instance.data_type));
+      return nullptr;
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h
index 58eaf31f71..462d6b7533 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.h
+++ b/tensorflow/core/common_runtime/base_collective_executor.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/device_attributes.pb.h"
 
 namespace tensorflow {
+class Broadcaster;
 class DeviceMgr;
 class RingReducer;
 
@@ -138,6 +139,12 @@ class BaseCollectiveExecutor : public CollectiveExecutor {
                              const string& exec_key, int64 step_id,
                              const Tensor* input, Tensor* output,
                              string* error);
+
+  Broadcaster* CreateBroadcaster(OpKernelContext* ctx,
+                                 OpKernelContext::Params* params,
+                                 const CollectiveParams& col_params,
+                                 const string& exec_key, int64 step_id,
+                                 Tensor* output, string* error);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/broadcaster.cc b/tensorflow/core/common_runtime/broadcaster.cc
new file mode 100644
index 0000000000..5e8af8653d
--- /dev/null
+++ b/tensorflow/core/common_runtime/broadcaster.cc
@@ -0,0 +1,249 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/broadcaster.h"
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/platform/env.h"
+
+// Set true for greater intelligibility of debug mode log messages.
+#define READABLE_KEYS false
+
+namespace tensorflow {
+
+namespace {
+// Key to be used for BufRendezvous by Broadcaster.
+string BroadcastBufKey(const string& exec_key, int src_rank, int dst_rank) {
+  if (READABLE_KEYS) {
+    return strings::StrCat("broadcast(", exec_key, "):src(", src_rank, "):dst(",
+                           dst_rank, ")");
+  } else {
+    // TODO(tucker): Try a denser format, e.g. a 64 or 128 bit hash.
+    return strings::StrCat(exec_key, ":", src_rank, ":", dst_rank);
+  }
+}
+}  // namespace
+
+Broadcaster::Broadcaster(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+                         OpKernelContext* ctx, OpKernelContext::Params* params,
+                         const CollectiveParams& col_params,
+                         const string& exec_key, int64 step_id, Tensor* output)
+    : col_exec_(col_exec),
+      dev_mgr_(dev_mgr),
+      ctx_(ctx),
+      col_params_(col_params),
+      exec_key_(exec_key),
+      rank_(col_params.subdiv_rank[0]),
+      is_source_(col_params.is_source),
+      output_(output),
+      done_(nullptr),
+      device_(nullptr) {}
+
+void Broadcaster::Run(StatusCallback done) {
+  // The optimal data transfer choreography is going to be very platform
+  // dependent. That will be addressed by later improvements here or by
+  // platform-specific overrides of collective broadcast. The initial version
+  // is simply a binary tree that completely ignores DeviceLocality.
+  done_ = std::move(done);
+
+  // Get the device for which we're executing and look up its locality.
+  status_ = dev_mgr_->LookupDevice(
+      col_params_.instance.device_names[col_params_.default_rank], &device_);
+  if (!status_.ok()) {
+    done_(status_);
+    return;
+  }
+  CHECK(device_);
+  device_locality_ = device_->attributes().locality();
+
+  RunTree();
+}
+
+// Binary tree parent/child relations are trivial to calculate, i.e.
+// device at rank r is the parent of 2r+1 and 2r+2.  The one exception
+// is if the source is not rank 0.  We treat that case as though the
+// source is appended to the front of the rank ordering as well as
+// continuing to occupy its current position.  Hence we calculate as
+// though each device's rank is actually r+1, then subtract 1 again to
+// get the descendant ranks.  If the source is not rank 0 then its
+// descendants include both {0,1} and the descendants of its current
+// position.  Where a non-0-rank source is a descendant of another
+// device, no send to it is necessary.
+
+/* static */
+int Broadcaster::TreeRecvFrom(const CollectiveParams& cp) {
+  DCHECK_EQ(1, cp.subdiv_rank.size());
+  if (cp.is_source) return -1;
+  int source_rank = cp.instance.impl_details.subdiv_source_rank[0];
+  int my_rank = cp.subdiv_rank[0];
+  if (source_rank == 0) {
+    return (my_rank - 1) / 2;
+  } else {
+    int predecessor_rank = (my_rank / 2) - 1;
+    return (predecessor_rank < 0) ? source_rank : predecessor_rank;
+  }
+}
+
+/* static */
+void Broadcaster::TreeSendTo(const CollectiveParams& cp,
+                             std::vector<int>* targets) {
+  DCHECK_EQ(1, cp.subdiv_rank.size());
+  targets->clear();
+  int my_rank = cp.subdiv_rank[0];
+  DCHECK_EQ(1, cp.instance.impl_details.subdiv_source_rank.size());
+  int source_rank = cp.instance.impl_details.subdiv_source_rank[0];
+  int successor_rank = 0;
+  if (source_rank == 0) {
+    successor_rank = (2 * my_rank) + 1;
+  } else {
+    successor_rank = (2 * (my_rank + 1));
+  }
+  DCHECK_NE(successor_rank, my_rank);
+  if (cp.is_source && source_rank != 0) {
+    // The source sends to rank 0,1 in addition to its positional
+    // descendants.
+    if (cp.group.group_size > 1) {
+      targets->push_back(0);
+    }
+    if (cp.group.group_size > 2 && source_rank != 1) {
+      targets->push_back(1);
+    }
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (successor_rank < cp.group.group_size && successor_rank != source_rank) {
+      targets->push_back(successor_rank);
+    }
+    ++successor_rank;
+  }
+}
+
+// Execute a tree broadcast, i.e. each non-source device receives from
+// one other and sends to up-to two others.
+void Broadcaster::RunTree() {
+  mutex mu;
+  int pending_count = 0;  // GUARDED_BY(mu)
+  condition_variable all_done;
+  std::vector<int> send_to_ranks;
+  TreeSendTo(col_params_, &send_to_ranks);
+
+  if (!is_source_) {
+    // Begin by receiving the value.
+    int recv_from_rank = TreeRecvFrom(col_params_);
+    Notification note;
+    DispatchRecv(recv_from_rank, output_,
+                 [this, recv_from_rank, &mu, &note](const Status& s) {
+                   mutex_lock l(mu);
+                   status_.Update(s);
+                   note.Notify();
+                 });
+    note.WaitForNotification();
+  }
+
+  // No callback is in flight yet, so status_ can be read without holding mu.
+  const bool ok_to_forward = status_.ok();
+  // Then forward value to all descendant devices.  The source forwards its
+  // input because output_ is only filled by the copy dispatched below.
+  if (ok_to_forward) {
+    for (int i = 0; i < send_to_ranks.size(); ++i) {
+      int target_rank = send_to_ranks[i];
+      {
+        mutex_lock l(mu);
+        ++pending_count;
+      }
+      DispatchSend(
+          target_rank, is_source_ ? &ctx_->input(0) : output_,
+          [this, target_rank, &mu, &pending_count, &all_done](const Status& s) {
+            mutex_lock l(mu);  // status_ is shared by all callbacks.
+            status_.Update(s);
+            --pending_count;
+            if (pending_count == 0) {
+              all_done.notify_all();
+            }
+          });
+    }
+  }
+
+  if (ok_to_forward && is_source_) {
+    // Meanwhile, copy input to output if we weren't lucky enough to
+    // be able to reuse input as output.
+    const Tensor* input = &ctx_->input(0);
+    if (input != output_ &&
+        (DMAHelper::base(input) != DMAHelper::base(output_))) {
+      {
+        mutex_lock l(mu);
+        ++pending_count;
+      }
+      DeviceContext* op_dev_ctx = ctx_->op_device_context();
+      CollectiveRemoteAccessLocal::MemCpyAsync(
+          op_dev_ctx, op_dev_ctx, device_, device_, ctx_->input_alloc_attr(0),
+          ctx_->output_alloc_attr(0), input, output_,
+          [this, &mu, &pending_count, &all_done](const Status& s) {
+            mutex_lock l(mu);  // status_ is shared by all callbacks.
+            status_.Update(s);
+            --pending_count;
+            if (0 == pending_count) {
+              all_done.notify_all();
+            }
+          });
+    }
+  }
+
+  // Then wait for all pending actions to complete.  Loop on the predicate
+  // because a condition variable may wake up spuriously.
+  {
+    mutex_lock l(mu);
+    while (pending_count > 0) {
+      all_done.wait(l);
+    }
+  }
+
+  VLOG(2) << "return status " << status_;
+  done_(status_);
+}
+
+void Broadcaster::DispatchSend(int dst_rank, const Tensor* src_tensor,
+                               const StatusCallback& done) {
+  string send_buf_key = BroadcastBufKey(exec_key_, rank_, dst_rank);
+  VLOG(1) << "DispatchSend " << send_buf_key << " from_device "
+          << device_->name();
+  int dst_idx =
+      col_params_.instance.impl_details.subdiv_permutations[0][dst_rank];
+  col_exec_->PostToPeer(col_params_.instance.device_names[dst_idx],
+                        col_params_.instance.task_names[dst_idx], send_buf_key,
+                        device_, ctx_->op_device_context(),
+                        ctx_->output_alloc_attr(0), src_tensor,
+                        device_locality_, done);
+}
+
+void Broadcaster::DispatchRecv(int src_rank, Tensor* dst_tensor,
+                               const StatusCallback& done) {
+  string recv_buf_key = BroadcastBufKey(exec_key_, src_rank, rank_);
+  int src_idx =
+      col_params_.instance.impl_details.subdiv_permutations[0][src_rank];
+  VLOG(1) << "DispatchRecv " << recv_buf_key << " from_device "
+          << col_params_.instance.device_names[src_idx];
+  int dst_idx = col_params_.instance.impl_details.subdiv_permutations[0][rank_];
+  CHECK_EQ(col_params_.instance.device_names[dst_idx], device_->name());
+  col_exec_->RecvFromPeer(col_params_.instance.device_names[src_idx],
+                          col_params_.instance.task_names[src_idx],
+                          col_params_.task.is_local[src_idx], recv_buf_key,
+                          device_, ctx_->op_device_context(),
+                          ctx_->output_alloc_attr(0), dst_tensor,
+                          device_locality_, done);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/broadcaster.h b/tensorflow/core/common_runtime/broadcaster.h
new file mode 100644
index 0000000000..bdf68f19ab
--- /dev/null
+++ b/tensorflow/core/common_runtime/broadcaster.h
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
+
+#include <vector>
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+
+// Tree-algorithm implementation of collective broadcast.
+class Broadcaster {
+ public:
+  Broadcaster(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+              OpKernelContext* ctx, OpKernelContext::Params* params,
+              const CollectiveParams& col_params, const string& exec_key,
+              int64 step_id, Tensor* output);
+
+  void Run(StatusCallback done);
+
+  // Returns the rank of the device from which this device should receive
+  // its value, -1 if no value should be received.
+  static int TreeRecvFrom(const CollectiveParams& cp);
+
+  // Populates targets with the ranks of the devices to which this device
+  // should forward the value.
+  static void TreeSendTo(const CollectiveParams& cp, std::vector<int>* targets);
+
+ private:
+  void DispatchSend(int dst_rank, const Tensor* src_tensor,
+                    const StatusCallback& done);  // Posts to dst_rank's device
+  void DispatchRecv(int src_rank, Tensor* dst_tensor,
+                    const StatusCallback& done);  // Receives from src_rank
+  void RunTree();
+
+  Status status_;                 // Handed to done_
+  CollectiveExecutor* col_exec_;  // Not owned
+  const DeviceMgr* dev_mgr_;      // Not owned
+  OpKernelContext* ctx_;          // Not owned
+  const CollectiveParams& col_params_;  // Not owned (held by reference)
+  const string exec_key_;               // Input to BroadcastBufKey()
+  const int rank_;  // This device's position in subdiv_permutations[0]
+  const bool is_source_;
+  Tensor* output_;  // Not owned
+  std::unique_ptr<CollectiveAdapter> ca_;
+  StatusCallback done_;  // Invoked with status_
+  Device* device_;  // The device for which this instance labors
+  DeviceLocality device_locality_;  // Passed to PostToPeer/RecvFromPeer
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
diff --git a/tensorflow/core/common_runtime/broadcaster_test.cc b/tensorflow/core/common_runtime/broadcaster_test.cc
new file mode 100644
index 0000000000..89d39144b3
--- /dev/null
+++ b/tensorflow/core/common_runtime/broadcaster_test.cc
@@ -0,0 +1,741 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/broadcaster.h"
+
+#include <algorithm>
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+namespace {
+
+const int64 kStepId = 123;    // Step id used by every executor in this file
+const int32 kNumSubdivs = 1;  // Subdiv not yet meaningful for broadcast
+
+// The test harness won't allow a mixture of fixture and non-fixture
+// tests in one file, so this is a trivial fixture for tests that don't
+// need the heavy-weight BroadcasterTest fixture.
+class TrivialTest : public ::testing::Test {
+ protected:
+  TrivialTest() {}
+};
+
+// Tests of static TreeSendTo() and TreeRecvFrom() functions.
+// D = number of devices
+// S = source rank
+// R = tested rank
+// RF = expected receive-from rank (-1 when R receives from nobody)
+// ST = expected send_to rank vector, compared element by element
+#define DEF_TL_TEST(D, S, R, RF, ST)                               \
+  TEST_F(TrivialTest, TreeLinks_##D##Devs_##S##Source_##R##Rank) { \
+    CollectiveParams cp;                                           \
+    cp.group.group_size = D;                                       \
+    cp.instance.impl_details.subdiv_source_rank = {S};             \
+    cp.subdiv_rank = {R};                                          \
+    cp.is_source = (S == R);                                       \
+    EXPECT_EQ(RF, Broadcaster::TreeRecvFrom(cp));                  \
+    std::vector<int> expected = ST;                                \
+    std::vector<int> send_to;                                      \
+    Broadcaster::TreeSendTo(cp, &send_to);                         \
+    ASSERT_EQ(expected.size(), send_to.size());                    \
+    for (size_t i = 0; i < expected.size(); ++i) {                 \
+      EXPECT_EQ(expected[i], send_to[i]);                          \
+    }                                                              \
+  }
+
+#define V(...) std::vector<int>({__VA_ARGS__})
+
+//          D  S  R  RF  ST
+// 2 device cases
+DEF_TL_TEST(2, 0, 0, -1, V(1))
+DEF_TL_TEST(2, 1, 0, 1, V())
+DEF_TL_TEST(2, 0, 1, 0, V())
+DEF_TL_TEST(2, 1, 1, -1, V(0))
+// 3 device cases
+DEF_TL_TEST(3, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(3, 0, 1, 0, V())
+DEF_TL_TEST(3, 0, 2, 0, V())
+DEF_TL_TEST(3, 1, 0, 1, V(2))
+DEF_TL_TEST(3, 1, 1, -1, V(0))
+DEF_TL_TEST(3, 1, 2, 0, V())
+DEF_TL_TEST(3, 2, 0, 2, V())
+DEF_TL_TEST(3, 2, 1, 2, V())
+DEF_TL_TEST(3, 2, 2, -1, V(0, 1))
+// 4 device cases
+DEF_TL_TEST(4, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(4, 0, 1, 0, V(3))
+DEF_TL_TEST(4, 0, 2, 0, V())
+DEF_TL_TEST(4, 0, 3, 1, V())
+DEF_TL_TEST(4, 1, 0, 1, V(2, 3))
+DEF_TL_TEST(4, 1, 1, -1, V(0))
+DEF_TL_TEST(4, 1, 2, 0, V())
+DEF_TL_TEST(4, 1, 3, 0, V())
+DEF_TL_TEST(4, 2, 0, 2, V(3))
+DEF_TL_TEST(4, 2, 1, 2, V())
+DEF_TL_TEST(4, 2, 2, -1, V(0, 1))
+DEF_TL_TEST(4, 2, 3, 0, V())
+DEF_TL_TEST(4, 3, 0, 3, V(2))
+DEF_TL_TEST(4, 3, 1, 3, V())
+DEF_TL_TEST(4, 3, 2, 0, V())
+DEF_TL_TEST(4, 3, 3, -1, V(0, 1))
+// 8 device cases
+//          D  S  R  RF  ST
+DEF_TL_TEST(8, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(8, 0, 1, 0, V(3, 4))
+DEF_TL_TEST(8, 0, 2, 0, V(5, 6))
+DEF_TL_TEST(8, 0, 3, 1, V(7))
+DEF_TL_TEST(8, 0, 4, 1, V())
+DEF_TL_TEST(8, 0, 5, 2, V())
+DEF_TL_TEST(8, 0, 6, 2, V())
+DEF_TL_TEST(8, 0, 7, 3, V())
+DEF_TL_TEST(8, 7, 0, 7, V(2, 3))
+DEF_TL_TEST(8, 7, 1, 7, V(4, 5))
+DEF_TL_TEST(8, 7, 2, 0, V(6))
+DEF_TL_TEST(8, 7, 3, 0, V())
+DEF_TL_TEST(8, 7, 4, 1, V())
+DEF_TL_TEST(8, 7, 5, 1, V())
+DEF_TL_TEST(8, 7, 6, 2, V())
+DEF_TL_TEST(8, 7, 7, -1, V(0, 1))
+#undef DEF_TL_TEST
+#undef V
+
+// Wraps CollectiveRemoteAccessLocal with the ability to return an
+// error status to the N'th action.
+// TODO(tucker): factor out of this file and ring_reducer_test.cc
+// into a single common source.
+class FailTestRMA : public CollectiveRemoteAccessLocal {
+ public:
+  FailTestRMA(const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+              int64 step_id, int fail_after)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        fail_after_(fail_after) {}
+
+  bool MaybeFail(const StatusCallback& done) {  // true => done(error) was called
+    bool fail_now = false;
+    {
+      mutex_lock l(mu_);
+      if (fail_after_ > 0) {  // A non-positive count never triggers a failure.
+        fail_now = (--fail_after_ == 0);
+      }
+    }
+    if (fail_now) {  // Handled outside the lock scope above.
+      auto error = errors::Internal("Deliberate failure");
+      LOG(INFO) << "triggering failure " << error;
+      SchedNonBlockingClosureAfter(  // Deferred abort of the buf_rendezvous.
+          1000, [this, error] { buf_rendezvous()->StartAbort(error); });
+      done(error);
+      return true;
+    }
+    return false;
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    if (MaybeFail(done)) return;  // Injected failure: skip the real receive.
+    CollectiveRemoteAccessLocal::RecvFromPeer(
+        peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+        to_alloc_attr, to_tensor, client_locality, done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    if (MaybeFail(done)) return;  // Injected failure: skip the real post.
+    CollectiveRemoteAccessLocal::PostToPeer(
+        peer_device, peer_task, key, from_device, from_device_ctx,
+        from_alloc_attr, from_tensor, client_locality, done);
+  }
+
+  mutex mu_;
+  int fail_after_ GUARDED_BY(mu_);  // Calls remaining before the injected failure
+};
+
+class BroadcasterTest : public ::testing::Test {
+ protected:
+  BroadcasterTest() : device_type_(DEVICE_CPU) {}
+
+  ~BroadcasterTest() override {
+    stop_ = true;
+    for (auto i : instances_) {
+      delete i;
+    }
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void SetUp() override {
+#if GOOGLE_CUDA
+    auto device_factory = DeviceFactory::GetFactory("GPU");
+    CHECK(device_factory);
+    SessionOptions options;
+    Status s = device_factory->CreateDevices(
+        options, "/job:worker/replica:0/task:0", &gpu_devices_);
+    CHECK(s.ok());
+#endif
+  }
+
+  void Init(int num_workers, int num_devices, DataType dtype,
+            const DeviceType& device_type, int fail_after) {
+    device_type_ = device_type;
+    std::vector<Device*> local_devices;
+    SessionOptions sess_opts;
+    sess_opts.env = Env::Default();
+    Bytes mem_limit(4 << 20);
+    DeviceLocality dev_locality;
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        if (device_type == DEVICE_CPU) {
+          string dev_name = strings::StrCat("/job:worker/replica:0/task:", wi,
+                                            "/device:CPU:", di);
+          local_devices.push_back(new ThreadPoolDevice(
+              sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
+        } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
+          int dev_idx = (wi * num_devices) + di;
+          if (dev_idx >= static_cast<int>(gpu_devices_.size())) {
+            LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
+                         "than one ring node.";
+          } else {
+            local_devices.push_back(gpu_devices_[dev_idx]);
+          }
+        } else {
+          LOG(FATAL) << "Unsupported device_type " << device_type;
+        }
+      }
+    }
+    if (!dev_mgr_ || device_type == DEVICE_CPU) {
+      dev_mgr_.reset(new DeviceMgr(local_devices));
+    }
+    dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
+    rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
+                           fail_after);
+    col_exec_ = new BaseCollectiveExecutor(&col_exec_mgr_, rma_, kStepId,
+                                           dev_mgr_.get());
+    col_params_.name = "test_collective";
+    col_params_.instance.data_type = dtype;
+    static const int kGroupKey = 5;
+    col_params_.group.group_key = kGroupKey;
+    static const int kInstanceKey = 17;
+    col_params_.instance.instance_key = kInstanceKey;
+    col_params_.group.device_type = device_type;
+    col_params_.group.group_size = num_workers * num_devices;
+    col_params_.instance.impl_details.subdiv_offsets.clear();
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    col_params_.instance.impl_details.subdiv_permutations.resize(kNumSubdivs);
+    col_params_.subdiv_rank.resize(kNumSubdivs);
+    int subdiv_stride = num_devices / kNumSubdivs;
+    for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+      col_params_.instance.impl_details.subdiv_offsets.push_back(sdi *
+                                                                 subdiv_stride);
+      col_params_.subdiv_rank[sdi] = sdi * subdiv_stride;
+    }
+
+    // Set up a local device ring order that's not just 0,1,2...
+    std::vector<int> local_ring_order;
+    for (int di = 0; di < num_devices; ++di) {
+      local_ring_order.push_back(di);
+    }
+    for (int di = 0; di < num_devices; ++di) {
+      bool is_odd = ((di % 2) == 1);
+      int other = (di + (is_odd ? 7 : 3)) % num_devices;
+      if (di == other) continue;
+      iter_swap(local_ring_order.begin() + di,
+                local_ring_order.begin() + other);
+    }
+    broadcast_dev_id_ = local_ring_order[0];
+    string lro_buf;
+    for (auto d : local_ring_order) strings::StrAppend(&lro_buf, d, ", ");
+    VLOG(1) << "local_ring_order " << lro_buf;
+
+    // Set up all of the fake device contexts.
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        string task_name = strings::StrCat("/job:worker/replica:0/task:", wi);
+        string dev_name = strings::StrCat(task_name, "/device:CPU:", di);
+        if (device_type == DEVICE_GPU) {
+          dev_name = strings::StrCat(task_name, "/device:GPU:0");
+        }
+        col_params_.instance.device_names.push_back(dev_name);
+        col_params_.instance.task_names.push_back(task_name);
+        // Normally each device would set is_local to its own perspective but
+        // this test runs in a single process so is_local is always true.
+        col_params_.task.is_local.push_back(true);
+        for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+          int rotated_di =
+              (di + col_params_.instance.impl_details.subdiv_offsets[sdi]) %
+              num_devices;
+          col_params_.instance.impl_details.subdiv_permutations[sdi].push_back(
+              wi * num_devices + local_ring_order[rotated_di]);
+        }
+      }
+    }
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        int rank = wi * num_devices + di;
+        instances_.push_back(new DeviceInstance(
+            rank, col_params_.instance.device_names[rank], device_type_, this));
+      }
+    }
+  }
+
+  typedef std::function<void(Tensor*)> InitFunc;
+
+  void Broadcast() {  // Schedules DoBroadcast() on every instance, then waits.
+    std::atomic<int> done(0);  // Number of instances that have finished.
+    for (auto di : instances_) {
+      SchedClosure([di, &done] {
+        di->DoBroadcast();
+        ++done;
+      });
+    }
+    while (done < instances_.size()) {  // Poll until all closures have run.
+      if (stop_) break;
+      Env::Default()->SleepForMicroseconds(1000);
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
+                                      const DeviceType& device_type,
+                                      DeviceBase* device) {
+    Status status;
+    std::unique_ptr<OpKernel> k = CreateOpKernel(
+        device_type, device, device->GetAllocator(AllocatorAttributes()), node,
+        TF_GRAPH_DEF_VERSION, &status);
+    if (!status.ok()) {
+      LOG(FATAL) << status;
+    }
+    return k;
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveBcastSend(
+      const CollectiveParams& params, Tensor* input,
+      const DeviceType& device_type, DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_bcast_send_", bcast_send_counter_++),
+        "CollectiveBcastSend");
+    TF_CHECK_OK(builder.Attr("T", input->dtype())
+                    .Attr("group_size", params.group.group_size)
+                    .Attr("group_key", params.group.group_key)
+                    .Attr("instance_key", params.instance.instance_key)
+                    .Attr("shape", input->shape())
+                    .Input(FakeInput(params.instance.data_type))
+                    .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveBcastRecv(
+      const CollectiveParams& params, const TensorShape& shape,
+      const DeviceType& device_type, DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_bcast_recv_", bcast_recv_counter_++),
+        "CollectiveBcastRecv");
+    TF_CHECK_OK(builder.Attr("T", params.instance.data_type)
+                    .Attr("group_size", params.group.group_size)
+                    .Attr("group_key", params.group.group_key)
+                    .Attr("instance_key", params.instance.instance_key)
+                    .Attr("shape", shape)
+                    .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  void BuildColParams() {}
+
+  template <typename T>
+  void RunTest(DataType dtype, const DeviceType& device_type, int num_workers,
+               int num_devices, int tensor_len, int fail_after) {
+    Init(num_workers, num_devices, dtype, device_type, fail_after);
+
+    // Initialize each instance tensor with distinct values (10^di * i).
+    for (int di = 0; di < instances_.size(); ++di) {
+      DeviceInstance* instance = instances_[di];
+      instance->InitTensor(
+          dtype, TensorShape({tensor_len}), [di](Tensor* t) {
+            for (size_t i = 0; i < t->NumElements(); ++i) {
+              // The cast is necessary to prevent clang-tidy from insisting
+              // that a faster non-open source function be substituted.
+              float value = pow(10, static_cast<double>(di)) * i;
+              t->flat<T>()(i) = value;
+            }
+          });
+    }
+
+    // Copy the expected value from the broadcast source tensor
+    std::vector<T> expected(tensor_len, 0.0);
+    const CollectiveParams& cp = instances_[0]->col_params_;
+    int broadcast_dev_id =
+        cp.instance.impl_details.subdiv_permutations
+            [0][cp.instance.impl_details.subdiv_source_rank[0]];
+    const Tensor* t = &instances_[broadcast_dev_id]->tensor_;
+    Tensor cpu_copy(dtype, TensorShape({tensor_len}));
+    if (device_type == DEVICE_GPU) {
+      Notification notification;
+      Device* dev = instances_[broadcast_dev_id]->device_;
+      auto* dev_info = dev->tensorflow_gpu_device_info();
+      CHECK(dev_info);
+      dev_info->default_context->CopyDeviceTensorToCPU(
+          t, "" /*tensor_name*/, dev, &cpu_copy,
+          [&notification](Status s) {
+            TF_CHECK_OK(s);
+            notification.Notify();
+          });
+      notification.WaitForNotification();
+      t = &cpu_copy;
+    }
+    for (size_t i = 0; i < t->NumElements(); ++i) {
+      expected[i] = t->flat<T>()(i);
+    }
+
+    Broadcast();
+
+    // At this point all of the ops have terminated.
+    for (int di = 0; di < instances_.size(); ++di) {
+      if (!instances_[di]->status_.ok()) {
+        ASSERT_GT(fail_after, 0);
+        ASSERT_EQ(instances_[di]->status_.error_message(),
+                  "Deliberate failure");
+        mutex_lock l(mu_);
+        ++failure_count_;
+        continue;
+      }
+      Tensor* inst = &instances_[di]->tensor_;
+      Tensor actual(dtype, TensorShape({tensor_len}));
+      if (device_type_ == DEVICE_CPU) {
+        CHECK(actual.CopyFrom(*inst, inst->shape()));
+      } else if (device_type_ == DEVICE_GPU) {
+        Notification notification;
+        Device* dev = instances_[di]->device_;
+        auto* dev_info = dev->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        dev_info->default_context->CopyDeviceTensorToCPU(
+            inst, "" /*tensor_name*/, dev, &actual,
+            [&notification](Status s) {
+              TF_CHECK_OK(s);
+              notification.Notify();
+            });
+        notification.WaitForNotification();
+      }
+      for (int i = 0; i < tensor_len; ++i) {
+        switch (dtype) {
+          case DT_FLOAT:
+            EXPECT_FLOAT_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          case DT_DOUBLE:
+            EXPECT_DOUBLE_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          case DT_INT32:
+          case DT_INT64:
+            EXPECT_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          default:
+            LOG(FATAL) << "unimplemented";
+        }
+      }
+    }
+
+    // Note that the order of operations during broadcast is
+    // non-deterministic and unlike the reduce case some Ops in the
+    // instance may succeed while others fail, even if a transmission
+    // failure occurs early in the operation chain.  So, when an abort
+    // is specified we need to verify that at least one Op fails with
+    // the expected status and any Op that succeeds yields the correct
+    // value.
+    if (fail_after > 0) {
+      mutex_lock l(mu_);
+      EXPECT_GT(failure_count_, 0);
+    }
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& dev_name,
+                   const DeviceType& device_type, BroadcasterTest* parent)
+        : parent_(parent),
+          dev_name_(dev_name),
+          device_type_(device_type),
+          rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(dev_name, &device_));
+      col_params_.name = parent_->col_params_.name;
+      col_params_.instance.data_type = parent_->col_params_.instance.data_type;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.instance.instance_key =
+          parent_->col_params_.instance.instance_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance.device_names =
+          parent_->col_params_.instance.device_names;
+      col_params_.instance.task_names =
+          parent_->col_params_.instance.task_names;
+      col_params_.task.is_local = parent_->col_params_.task.is_local;
+      col_params_.instance.impl_details.subdiv_permutations =
+          parent_->col_params_.instance.impl_details.subdiv_permutations;
+      col_params_.subdiv_rank = parent_->col_params_.subdiv_rank;
+
+      int group_size = col_params_.group.group_size;
+      CHECK_EQ(group_size, col_params_.instance.device_names.size());
+      // Default rank is order in device_names.
+      col_params_.default_rank = rank;
+      // perm_rank is order in subdiv[0]:
+      int perm_rank = -1;
+      for (int i = 0;
+           i < col_params_.instance.impl_details.subdiv_permutations[0].size();
+           ++i) {
+        if (rank ==
+            col_params_.instance.impl_details.subdiv_permutations[0][i]) {
+          perm_rank = i;
+          break;
+        }
+      }
+      CHECK_GE(perm_rank, 0);
+      col_params_.instance.impl_details.subdiv_source_rank.resize(1, 0);
+      col_params_.is_source =
+          (perm_rank ==
+           col_params_.instance.impl_details.subdiv_source_rank[0]);
+      // Set rank in all subdivs by finding that default_rank.
+      for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+        for (int r = 0;
+             r <
+             col_params_.instance.impl_details.subdiv_permutations[sdi].size();
+             ++r) {
+          if (col_params_.default_rank ==
+              col_params_.instance.impl_details.subdiv_permutations[sdi][r]) {
+            col_params_.subdiv_rank[sdi] = r;
+            CHECK_EQ(0, sdi);
+            CHECK_EQ(perm_rank, col_params_.subdiv_rank[sdi]);
+            break;
+          }
+        }
+      }
+      CHECK_EQ(group_size, col_params_.task.is_local.size());
+      CHECK_EQ(group_size, col_params_.instance.task_names.size());
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const InitFunc& f) {  // f fills a CPU-accessible tensor.
+      tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      if (device_type_ == DEVICE_CPU) {
+        f(&tensor_);  // CPU: fill the instance tensor in place.
+      } else if (device_type_ == DEVICE_GPU) {
+        Tensor cpu_tensor(dtype, shape);  // GPU: fill on host, then copy over.
+        f(&cpu_tensor);
+        Notification notification;
+        auto* dev_info = device_->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        dev_info->default_context->CopyCPUTensorToDevice(
+            &cpu_tensor, device_, &tensor_, [&notification](Status s) {
+              TF_CHECK_OK(s);
+              notification.Notify();
+            });
+        notification.WaitForNotification();  // Block until the copy is done.
+      } else {
+        LOG(FATAL) << "Unsupported device_type " << device_type_;
+      }
+    }
+
+    void DoBroadcast() {
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = parent_->step_id_;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      int forward_from[] = {0};
+      if (col_params_.is_source) {
+        op_params.forward_from_array = &forward_from[0];
+      }
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op =
+          col_params_.is_source
+              ? parent_->GetCollectiveBcastSend(col_params_, &tensor_,
+                                                DEVICE_CPU, device_)
+              : parent_->GetCollectiveBcastRecv(col_params_, tensor_.shape(),
+                                                DEVICE_CPU, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      Tensor* output_tensor_ptr = nullptr;
+      if (col_params_.is_source) {
+        TF_CHECK_OK(ctx.forward_input_or_allocate_output(
+            {0}, 0, tensor_.shape(), &output_tensor_ptr));
+      } else {
+        TF_CHECK_OK(
+            ctx.allocate_output(0, tensor_.shape(), &output_tensor_ptr));
+      }
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+
+      // Prepare a Broadcaster instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      Broadcaster broadcaster(parent_->col_exec_, parent_->dev_mgr_.get(), &ctx,
+                              &op_params, col_params_, exec_key, kStepId,
+                              output_tensor_ptr);
+
+      // Start execution in a threadpool then wait for completion.
+      Notification notification;
+      broadcaster.Run([this, &notification](Status s) {
+        status_ = s;
+        notification.Notify();
+      });
+      notification.WaitForNotification();
+      if (status_.ok()) {
+        CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape()));
+      }
+
+      dev_ctx->Unref();
+    }
+
+    BroadcasterTest* parent_;
+    string dev_name_;
+    DeviceType device_type_ = DEVICE_CPU;
+    int rank_;
+    Tensor tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    std::unique_ptr<CollectiveAdapter> ca_;
+    std::unique_ptr<OpKernelContext> ctx_;
+    Status status_;
+  };  // class DeviceInstance
+
+  bool stop_ = false;
+  int64 step_id_ = kStepId;
+  int broadcast_dev_id_ = 0;
+  DeviceType device_type_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_ = nullptr;
+  CollectiveRemoteAccessLocal* rma_;
+  std::unique_ptr<DeviceResolverLocal> dev_resolver_;
+  std::vector<DeviceInstance*> instances_;
+  CollectiveParams col_params_;
+  std::vector<tensorflow::Device*> gpu_devices_;
+  std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
+  mutex mu_;
+  int bcast_recv_counter_ GUARDED_BY(mu_) = 0;
+  int bcast_send_counter_ GUARDED_BY(mu_) = 0;
+  int failure_count_ GUARDED_BY(mu_) = 0;
+};
+
+// Tests of full broadcast algorithm, with different device and
+// data types.
+// B = data element type
+// T = device type
+// W = number of workers
+// D = number of devices per worker
+// L = tensor length
+// A = abort after count
+#define DEF_TEST(B, T, W, D, L, A)                                 \
+  TEST_F(BroadcasterTest,                                          \
+         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Len##L##_Abt##A) { \
+    DataType dtype = DT_##B;                                       \
+    switch (dtype) {                                               \
+      case DT_FLOAT: {                                             \
+        RunTest<float>(dtype, DEVICE_##T, W, D, L, A);             \
+      } break;                                                     \
+      case DT_DOUBLE: {                                            \
+        RunTest<double>(dtype, DEVICE_##T, W, D, L, A);            \
+      } break;                                                     \
+      case DT_INT32: {                                             \
+        RunTest<int32>(dtype, DEVICE_##T, W, D, L, A);             \
+      } break;                                                     \
+      case DT_INT64: {                                             \
+        RunTest<int64>(dtype, DEVICE_##T, W, D, L, A);             \
+      } break;                                                     \
+      default:                                                     \
+        LOG(FATAL) << "Unimplemented";                             \
+    }                                                              \
+  }
+
+#if !GOOGLE_CUDA  // Same test as SetUp(): a value of 0 also means "no CUDA".
+//       B      T    W  D  L  A
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 1, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 4095, 0)
+DEF_TEST(FLOAT, CPU, 4, 4, 1045991, 0)
+
+DEF_TEST(DOUBLE, CPU, 2, 4, 128, 0)
+DEF_TEST(INT32, CPU, 2, 4, 128, 0)
+DEF_TEST(INT64, CPU, 2, 4, 128, 0)
+
+// Failure cases
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 1)
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 5)
+#endif
+
+#if GOOGLE_CUDA  // GPU devices are only created in SetUp() under this test.
+// Can only set W=1 for GPU tests.
+//       B      T    W  D  L  A
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 33, 0)
+DEF_TEST(FLOAT, GPU, 1, 3, 64, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 4095, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1045991, 0)
+
+DEF_TEST(DOUBLE, GPU, 1, 8, 1001, 0)
+DEF_TEST(INT64, GPU, 1, 8, 1001, 0)
+
+// Failure cases
+DEF_TEST(FLOAT, GPU, 1, 8, 128, 6)
+#endif
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index 393d3f824d..bdddf927d8 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -250,6 +250,38 @@ GlobalDeviceMap EstablishGlobalRank(
   return gdm;
 }
 
+// Count the devices associated with each task and set
+// cp->instance.same_num_devices_per_task.  Requires
+// cp->instance.task_names be sorted.
+void SetDevPerTask(CollectiveParams* cp) {
+  cp->instance.same_num_devices_per_task = false;
+  if (cp->instance.task_names.empty()) return;
+  int dev_per_task = -1;  // Unset until the first task is counted.
+  int count = 0;  // Entries seen so far for the current task name.
+  const string* last_task_name = &cp->instance.task_names[0];
+  for (const string& task_name : cp->instance.task_names) {
+    if (task_name != *last_task_name) {
+      CHECK_GT(count, 0);
+      if (dev_per_task < 0) {
+        dev_per_task = count;
+      } else {
+        CHECK_GT(dev_per_task, 0);
+        if (count != dev_per_task) return;  // Mismatch: flag stays false.
+      }
+      count = 1;
+      last_task_name = &task_name;
+    } else {
+      ++count;
+    }
+  }
+  CHECK_GT(count, 0);
+  if ((dev_per_task > 0) && (count != dev_per_task)) {
+    return;  // The last task's count differs: flag stays false.
+  }
+  cp->instance.same_num_devices_per_task = true;
+  CHECK_EQ((cp->group.group_size % cp->group.num_tasks), 0);
+}
+
 // Sort cp->instance.device_names lexicographically, but do by first
 // computing a reordering permutation so we can keep cp->instance.task_names
 // in corresponding order.
@@ -278,6 +310,7 @@ void SortDevicesAndTasks(CollectiveParams* cp) {
   cp->instance.device_names = std::move(new_devs);
   cp->instance.task_names = std::move(new_tasks);
   VLOG(1) << "Modified device_names on " << cp;
+  SetDevPerTask(cp);
 }
 
 // Establish the requested number of subdivision permutations based on the
@@ -343,17 +376,18 @@ void GenerateSubdivPerms(const string& device, int source_rank,
 
   if (cp->instance.type == BROADCAST_COLLECTIVE) {
     CHECK_GE(source_rank, 0);
-    cp->subdiv_source_rank.resize(
+    cp->instance.impl_details.subdiv_source_rank.resize(
         cp->instance.impl_details.subdiv_offsets.size(), -1);
-    for (int sdi = 0; sdi < cp->subdiv_source_rank.size(); ++sdi) {
+    for (int sdi = 0; sdi < cp->instance.impl_details.subdiv_source_rank.size();
+         ++sdi) {
       for (int j = 0; j < cp->group.group_size; ++j) {
         if (cp->instance.impl_details.subdiv_permutations[sdi][j] ==
             source_rank) {
-          cp->subdiv_source_rank[sdi] = j;
+          cp->instance.impl_details.subdiv_source_rank[sdi] = j;
           break;
         }
       }
-      CHECK_GE(cp->subdiv_source_rank[sdi], 0);
+      CHECK_GE(cp->instance.impl_details.subdiv_source_rank[sdi], 0);
     }
   }
 
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
index 4e3c7125f2..4e33c4779a 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
@@ -91,9 +91,10 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) {
       EXPECT_TRUE(cps[i].task.is_local[j]);
     }
     EXPECT_EQ(cps[i].subdiv_rank[0], i);
-    EXPECT_EQ(cps[i].subdiv_source_rank.size(), 0);
+    EXPECT_EQ(cps[i].instance.impl_details.subdiv_source_rank.size(), 0);
     EXPECT_FALSE(cps[i].is_source);
     EXPECT_EQ(cps[i].default_rank, i);
+    EXPECT_TRUE(cps[i].instance.same_num_devices_per_task);
   }
 }
 
@@ -138,10 +139,11 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) {
     }
     ASSERT_GT(cps[i].subdiv_rank.size(), 0);
     EXPECT_EQ(cps[i].subdiv_rank[0], i);
-    ASSERT_GT(cps[i].subdiv_source_rank.size(), 0);
-    EXPECT_EQ(cps[i].subdiv_source_rank[0], 1);
+    ASSERT_GT(cps[i].instance.impl_details.subdiv_source_rank.size(), 0);
+    EXPECT_EQ(cps[i].instance.impl_details.subdiv_source_rank[0], 1);
     EXPECT_EQ(cps[i].is_source, (i == 1));
     EXPECT_EQ(cps[i].default_rank, i);
+    EXPECT_TRUE(cps[i].instance.same_num_devices_per_task);
   }
 }
 
diff --git a/tensorflow/core/common_runtime/collective_rma_local.h b/tensorflow/core/common_runtime/collective_rma_local.h
index d25dd5f04a..716e23bfa1 100644
--- a/tensorflow/core/common_runtime/collective_rma_local.h
+++ b/tensorflow/core/common_runtime/collective_rma_local.h
@@ -67,6 +67,8 @@ class CollectiveRemoteAccessLocal : public PerStepCollectiveRemoteAccess {
     dev_resolver_->ClearTask(task);
   }
 
+  BufRendezvous* buf_rendezvous() override { return &buf_rendezvous_; }
+
   // Copy utility that always copies bytes from src to dst even if
   // they are on the same device, unlike CopyTensor::ViaDMA which will
   // just change the dst buffer pointer in that case.
diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc
index a26f2c2f31..d4ac50cbbe 100644
--- a/tensorflow/core/framework/collective.cc
+++ b/tensorflow/core/framework/collective.cc
@@ -38,6 +38,7 @@ CollInstanceParams& CollInstanceParams::operator=(
     device_names.clear();
     device_names.assign(other.device_names.begin(), other.device_names.end());
     task_names.assign(other.task_names.begin(), other.task_names.end());
+    same_num_devices_per_task = other.same_num_devices_per_task;
     impl_details.subdiv_offsets.assign(
         other.impl_details.subdiv_offsets.begin(),
         other.impl_details.subdiv_offsets.end());
@@ -76,6 +77,13 @@ string CollInstanceParams::ToString() const {
     }
     strings::StrAppend(&v, "}");  // one subdiv
   }
+  if (!impl_details.subdiv_source_rank.empty()) {
+    strings::StrAppend(&v, " subdiv_source_rank={");
+    for (const auto& r : impl_details.subdiv_source_rank) {
+      strings::StrAppend(&v, r, ",");
+    }
+    strings::StrAppend(&v, "}");
+  }
   strings::StrAppend(&v, "}");  // all subdivs
   return v;
 }
@@ -98,13 +106,6 @@ string CollectiveParams::ToString() const {
   for (const auto& r : subdiv_rank) {
     strings::StrAppend(&v, r, ",");
   }
-  if (!subdiv_source_rank.empty()) {
-    strings::StrAppend(&v, " subdiv_rank={");
-    for (const auto& r : subdiv_source_rank) {
-      strings::StrAppend(&v, r, ",");
-    }
-    strings::StrAppend(&v, "}");
-  }
   strings::StrAppend(&v, "}}");
   return v;
 }
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 5810c7fa54..40d82ab0e9 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -79,6 +79,8 @@ struct CollInstanceParams {
   std::vector<string> device_names;
   // Task name prefix of corresponding device name.
   std::vector<string> task_names;
+  // True if every task has the same number of devices.
+  bool same_num_devices_per_task;
   CollImplDetails impl_details;
   string ToString() const;
   CollInstanceParams& operator=(const struct CollInstanceParams& other);
@@ -102,7 +104,6 @@ struct CollectiveParams {
   bool is_source;    // broadcast only
   // Rank of this device in each subdivision permutation.
   std::vector<int> subdiv_rank;
-  std::vector<int> subdiv_source_rank;
   std::unique_ptr<OpKernel> merge_op;  // reduction only
   std::unique_ptr<OpKernel> final_op;  // reduction only
   string ToString() const;
@@ -284,12 +285,14 @@ class CollectiveExecutor : public PeerAccessInterface, public core::RefCounted {
   TF_DISALLOW_COPY_AND_ASSIGN(CollectiveExecutor);
 };
 
-// Interface of a helper object that provices a CollectiveExecutor with
+// Interface of a helper object that provides a CollectiveExecutor with
 // all of the remote access it needs.
 class CollectiveRemoteAccess : public PeerAccessInterface,
                                public DeviceResolverInterface {
  public:
   virtual ~CollectiveRemoteAccess() {}
+
+  virtual BufRendezvous* buf_rendezvous() = 0;
 };
 
 // A per-step version of CollectiveRemoteAccess that cleans up outstanding
-- 
GitLab


From 55706e693ab20f6200061fb73067cbf27707cccd Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Thu, 19 Apr 2018 13:19:27 -0700
Subject: [PATCH 169/450] Support various shapes in TPU DistributionStrategy.

PiperOrigin-RevId: 193563912
---
 .../distribute/python/minimize_loss_test.py   | 11 +---
 .../distribute/python/single_loss_example.py  |  5 +-
 .../contrib/distribute/python/tpu_strategy.py | 61 +++++++++++++------
 .../contrib/distribute/python/values.py       | 33 ++++++++++
 4 files changed, 80 insertions(+), 30 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index 6c73250ded..43b2e91cbf 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -57,25 +57,18 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      def tpu_dataset_fn():
-        return dataset_fn().batch(2)
       # TODO(isaprykin):  Eliminate `is_tpu`. Probably add a
       # `DistributionStrategy.create_monitor` so that each DistributionStrategy
       # could influence its training loop. That method would return an instance
       # of Monitor.  TPUMonitor would execute tpu.initialize_system() and
       # tpu.shutdown_system().
       iterator = distribution.distribute_dataset(
-          tpu_dataset_fn if is_tpu else dataset_fn).make_one_shot_iterator()
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
-        # TODO(isaprykin): Make iterator get_next() return a list of sub-
-        # batches for each iteration. Pass iterator.get_next() and not iterator
-        # to call_for_each_tower.
         return distribution.group(
             distribution.call_for_each_tower(
-                model_fn,
-                iterator.get_next() if not is_tpu else iterator,
-                run_concurrently=layer.built))
+                model_fn, iterator.get_next(), run_concurrently=layer.built))
 
       if not context.executing_eagerly():
         with self.test_session() as sess:
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
index 9e8f919c8a..abd13c6cc6 100644
--- a/tensorflow/contrib/distribute/python/single_loss_example.py
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -54,7 +54,7 @@ def minimize_loss_example(optimizer_fn,
   """Example of non-distribution-aware legacy code."""
 
   def dataset_fn():
-    return dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+    return dataset_ops.Dataset.from_tensors([[1.]]).repeat().batch(2)
 
   # An Optimizer instance is created either outside or inside model_fn.
   outer_optimizer = None
@@ -63,10 +63,11 @@ def minimize_loss_example(optimizer_fn,
 
   layer = core.Dense(1, use_bias=use_bias)
 
-  def model_fn(x):
+  def model_fn(xs):
     """A very simple model written by the user."""
 
     def loss_fn():
+      x = math_ops.reduce_mean(xs, keepdims=True)
       y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
       return y * y
 
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index 804217b5ce..ceb52ceca7 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 
 from tensorflow.contrib import tpu
 from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python import values
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -33,35 +34,48 @@ from tensorflow.python.ops import control_flow_ops
 
 # TODO(isaprykin):  Consider whether inheriting is really appropriate.
 class TPUStrategy(one_device_strategy.OneDeviceStrategy):
+  """Experimental TPU distribution strategy implementation."""
 
-  def __init__(self, master=None, iterations=None, model_dir=None):
+  def __init__(self,
+               global_batch_size=2,
+               num_cores_per_host=2,
+               iterations_per_step=2):
+    # TODO(isaprykin): Generalize the defaults.
     super(TPUStrategy, self).__init__('/cpu:0')
+    # TODO(isaprykin): Auto-detect number of cores and hosts.
+    self._num_cores_per_host = num_cores_per_host
+    self._global_batch_size = global_batch_size
+    # TODO(isaprykin): This might have to be per-call.
+    self._iterations_per_step = iterations_per_step
+
+  def distribute_dataset(self, dataset_fn):
+    return values.PerIterationDataset(
+        self._call_dataset_fn(dataset_fn), self._iterations_per_step)
 
   def _call_for_each_tower(self, fn, *args, **kwargs):
     kwargs.pop('run_concurrently', None)
 
-    # TODO(isaprykin): Give an API for many iterations per step.
-    iterations = 1
+    # TODO(isaprykin): Support variable arguments similar to PerDevice+regroup.
+    inputs = args[0]
 
-    # TODO(isaprykin): Do not hard code shapes and input format :)
-    # TODO(isaprykin): Detect the number of TPU cores automatically.
-
-    def dequeueing_fn(*args, **kwargs):
-      del args, kwargs
-      x, = tpu.infeed_dequeue_tuple(dtypes=[dtypes.float32], shapes=[[1, 1, 1]])
-      return fn(x)
-
-    iterator = args[0]
+    sharded_shape = [None]  # Python 2 nonlocal.
 
     def infeed_input(i):
       """Get input, split it and then enqueue."""
-      batches = iterator.get_next()
-      batches = array_ops.split(batches, 2)
+      batches = array_ops.gather(inputs, i)
+
+      # TODO(isaprykin):  Handle partial batch.
+      global_shape = [self._global_batch_size] + list(batches.get_shape())[1:]
+      sharded_shape[0] = ([self._global_batch_size / self._num_cores_per_host] +
+                          list(global_shape)[1:])
+
+      batches.set_shape(global_shape)
+      batches = array_ops.split(batches, self._num_cores_per_host)
 
       infeeds = [
           tpu_ops.infeed_enqueue_tuple(
-              inputs=[batches[j]], shapes=[[1, 1, 1]], device_ordinal=j)
-          for j in range(2)
+              inputs=[batches[j]], shapes=[sharded_shape[0]], device_ordinal=j)
+          for j in range(self._num_cores_per_host)
       ]
 
       with ops.control_dependencies(infeeds):
@@ -69,14 +83,23 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
 
     with ops.device('/task:0/device:CPU:0'):
       enqueue_ops = control_flow_ops.while_loop(
-          lambda i: i < iterations,
+          lambda i: i < self._iterations_per_step,
           infeed_input, [constant_op.constant(0)],
           parallel_iterations=1)
 
+    assert sharded_shape[0]
+
+    def dequeueing_fn(*args, **kwargs):
+      del args, kwargs
+      x, = tpu.infeed_dequeue_tuple(
+          dtypes=[dtypes.float32], shapes=[sharded_shape[0]])
+      return fn(x)
+
     def iterate_on_tpu():
-      return tpu.repeat(iterations, dequeueing_fn, [])
+      return tpu.repeat(self._iterations_per_step, dequeueing_fn, [])
 
     with one_device_strategy._OneDeviceTowerContext(self):  # pylint: disable=protected-access
-      tpu_result = tpu.batch_parallel(iterate_on_tpu, [], num_shards=2)
+      tpu_result = tpu.batch_parallel(
+          iterate_on_tpu, [], num_shards=self._num_cores_per_host)
 
     return control_flow_ops.group(tpu_result, enqueue_ops)
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 18fedd2775..62016c3a78 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -570,6 +570,39 @@ class PerDeviceDataset(object):
         dataset_iterator, self._devices, self._prefetch_on_device)
 
 
+class MultiIterator(object):
+  """Iterator that returns results of multiple get_next()s."""
+
+  def __init__(self, dataset_iterator, iterations):
+    self._dataset_iterator = dataset_iterator
+    self._iterations = iterations  # Wrapped get_next() calls per get_next().
+
+  def get_next(self, name=None):
+    return [  # One wrapped get_next() result per iteration.
+        self._dataset_iterator.get_next(name=name)
+        for _ in range(self._iterations)
+    ]
+
+  @property
+  def initializer(self):
+    return self._dataset_iterator.initializer  # Delegates to the wrapped one.
+
+
+class PerIterationDataset(object):  # Its iterators are MultiIterators.
+
+  def __init__(self, dataset, iterations):
+    self._dataset = dataset
+    self._iterations = iterations  # Passed through to each MultiIterator.
+
+  def make_one_shot_iterator(self):
+    iterator = self._dataset.make_one_shot_iterator()
+    return MultiIterator(iterator, self._iterations)
+
+  def make_initializable_iterator(self):
+    iterator = self._dataset.make_initializable_iterator()
+    return MultiIterator(iterator, self._iterations)
+
+
 class MapOutput(object):
   """Map can result in multiple outputs per device."""
 
-- 
GitLab


From 7f1e64eb94447665047fac16c67b5351bcf3c8a3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 13:21:25 -0700
Subject: [PATCH 170/450] Allow output to have a different shape from input in
 image.transform (#17011).

PiperOrigin-RevId: 193564222
---
 tensorflow/contrib/image/kernels/image_ops.cc |  7 ++-
 tensorflow/contrib/image/kernels/image_ops.h  |  2 +-
 tensorflow/contrib/image/ops/image_ops.cc     | 52 +++++++++++++++++--
 .../python/kernel_tests/image_ops_test.py     | 30 +++++++++++
 .../contrib/image/python/ops/image_ops.py     | 39 ++++++++------
 5 files changed, 107 insertions(+), 23 deletions(-)

diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index c2e32da133..ae4b1ba62a 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -70,6 +70,7 @@ class ImageProjectiveTransform : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& images_t = ctx->input(0);
     const Tensor& transform_t = ctx->input(1);
+    const Tensor& output_dim = ctx->input(2);
     OP_REQUIRES(ctx, images_t.shape().dims() == 4,
                 errors::InvalidArgument("Input images must have rank 4"));
     OP_REQUIRES(ctx,
@@ -83,7 +84,11 @@ class ImageProjectiveTransform : public OpKernel {
     auto images = images_t.tensor<T, 4>();
     auto transform = transform_t.matrix<float>();
     Tensor* output_t;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t));
+    // Image is NHWC format.
+    auto output_shape = images_t.shape();
+    output_shape.set_dim(1, output_dim.vec<int>()(0));
+    output_shape.set_dim(2, output_dim.vec<int>()(1));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t));
     auto output = output_t->tensor<T, 4>();
     (FillProjectiveTransform<Device, T>(interpolation_))(
         ctx->eigen_device<Device>(), &output, images, transform);
diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h
index ad50133061..2320329b92 100644
--- a/tensorflow/contrib/image/kernels/image_ops.h
+++ b/tensorflow/contrib/image/kernels/image_ops.h
@@ -161,7 +161,7 @@ struct FillProjectiveTransform {
   void operator()(const Device& device, OutputType* output,
                   const InputType& images,
                   const TransformsType& transform) const {
-    output->device(device) = images.generate(
+    output->device(device) = output->generate(
         ProjectiveGenerator<Device, T>(images, transform, interpolation_));
   }
 };
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 68771b3d05..4c6d8c0d19 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -19,9 +19,55 @@ limitations under the License.
 
 namespace tensorflow {
 
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+namespace {
+
+// Sets output[0] to shape [batch_dim,height,width,channel_dim], where height
+// and width are read from input #size_input_idx (unknown if unavailable).
+Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim,
+                             int size_input_idx, DimensionHandle channel_dim) {
+  // Verify that the size input is a vector of 2 elements.
+  ShapeHandle size;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size));
+  DimensionHandle unused;
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused));
+
+  // Get size values from the size tensor, if one is available here.
+  const Tensor* size_tensor = c->input_tensor(size_input_idx);
+  DimensionHandle width;
+  DimensionHandle height;
+  if (size_tensor == nullptr) {  // No values to read: dims stay unknown.
+    width = c->UnknownDim();
+    height = c->UnknownDim();
+  } else {
+    // TODO(petewarden) - Remove once we have constant evaluation in C++ only.
+    if (size_tensor->dtype() != DT_INT32) {
+      return errors::InvalidArgument(
+          "Bad size input type for SetOutputToSizedImage: Expected DT_INT32 "
+          "but got ",
+          DataTypeString(size_tensor->dtype()), " for input #", size_input_idx,
+          " in ", c->DebugString());
+    }
+    auto vec = size_tensor->vec<int32>();
+    height = c->MakeDim(vec(0));  // Size is ordered [height, width].
+    width = c->MakeDim(vec(1));
+  }
+  c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim}));
+  return Status::OK();
+}
+
+// Shape function: batch and channel dims come from dims 0 and 3 of input 0;
+// height and width come from input 2.
+Status ResizeShapeFn(InferenceContext* c) {
+  ShapeHandle input;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
+  return SetOutputToSizedImage(c, c->Dim(input, 0), 2 /* size_input_idx */,
+                               c->Dim(input, 3));
+}
+
+}  // namespace
+
 // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc.
 // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0).
 // TODO(ringwalt): Add an "output_shape" argument. This is sufficient to
@@ -29,13 +75,11 @@ using shape_inference::ShapeHandle;
 REGISTER_OP("ImageProjectiveTransform")
     .Input("images: dtype")
     .Input("transforms: float32")
+    .Input("output_shape: int32")
     .Attr("dtype: {uint8, int32, int64, float32, float64}")
     .Attr("interpolation: string")
     .Output("transformed_images: dtype")
-    .SetShapeFn([](InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      return Status::OK();
-    })
+    .SetShapeFn(ResizeShapeFn)
     .Doc(R"doc(
 Applies the given transform to each of the images.
 
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index b50177ae56..c0151d320f 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -195,10 +195,40 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
           x_init_value=test_image)
       self.assertLess(left_err, 1e-10)
 
+  def _test_grad_different_shape(self, input_shape, output_shape):
+    with self.test_session():
+      test_image_shape = input_shape
+      test_image = np.random.randn(*test_image_shape)
+      test_image_tensor = constant_op.constant(
+          test_image, shape=test_image_shape)
+      test_transform = image_ops.angles_to_projective_transforms(
+          np.pi / 2, 4, 4)
+
+      if len(output_shape) == 2:
+        resize_shape = output_shape
+      elif len(output_shape) == 3:
+        resize_shape = output_shape[0:2]
+      elif len(output_shape) == 4:
+        resize_shape = output_shape[1:3]
+      output = image_ops.transform(
+          images=test_image_tensor,
+          transforms=test_transform,
+          output_shape=resize_shape)
+      left_err = gradient_checker.compute_gradient_error(
+          test_image_tensor,
+          test_image_shape,
+          output,
+          output_shape,
+          x_init_value=test_image)
+      self.assertLess(left_err, 1e-10)
+
   def test_grad(self):
     self._test_grad([16, 16])
     self._test_grad([4, 12, 12])
     self._test_grad([3, 4, 12, 12])
+    self._test_grad_different_shape([16, 16], [8, 8])
+    self._test_grad_different_shape([4, 12, 3], [8, 24, 3])
+    self._test_grad_different_shape([3, 4, 12, 3], [3, 8, 24, 3])
 
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index c139ae89d8..0cb7bdc75d 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -212,7 +212,11 @@ def translations_to_projective_transforms(translations, name=None):
         axis=1)
 
 
-def transform(images, transforms, interpolation="NEAREST", name=None):
+def transform(images,
+              transforms,
+              output_shape=None,
+              interpolation="NEAREST",
+              name=None):
   """Applies the given transform(s) to the image(s).
 
   Args:
@@ -228,7 +232,10 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
        where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to
        the transform mapping input points to output points. Note that gradients
        are not backpropagated into transformation parameters.
+    output_shape: Output dimension after the transform, [height, width].
+       If None, output is the same size as input image.
     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
+    name: The name of the op.
 
   Returns:
     Image(s) with the same type and shape as `images`, with the given
@@ -255,6 +262,14 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
     else:
       raise TypeError("Images should have rank between 2 and 4.")
 
+    if output_shape is None:
+      output_shape = images.get_shape()[1:3]
+    elif len(output_shape) != 2:
+      raise TypeError(
+          "output_shape must either be None or a vector of 2 elements.")
+    output_shape = ops.convert_to_tensor(
+        output_shape, name="output_shape", dtype=dtypes.int32)
+
     if len(transform_or_transforms.get_shape()) == 1:
       transforms = transform_or_transforms[None]
     elif transform_or_transforms.get_shape().ndims is None:
@@ -265,7 +280,7 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
     else:
       raise TypeError("Transforms should have rank 1 or 2.")
     output = gen_image_ops.image_projective_transform(
-        images, transforms, interpolation=interpolation.upper())
+        images, transforms, output_shape, interpolation=interpolation.upper())
     if len(image_or_images.get_shape()) == 2:
       return output[0, :, :, 0]
     elif len(image_or_images.get_shape()) == 3:
@@ -375,14 +390,6 @@ def _image_projective_transform_grad(op, grad):
 
   if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES:
     raise TypeError("Invalid dtype %s." % image_or_images.dtype)
-  if len(image_or_images.get_shape()) == 2:
-    images = image_or_images[None, :, :, None]
-  elif len(image_or_images.get_shape()) == 3:
-    images = image_or_images[None, :, :, :]
-  elif len(image_or_images.get_shape()) == 4:
-    images = image_or_images
-  else:
-    raise TypeError("Images should have rank between 2 and 4")
   if len(transform_or_transforms.get_shape()) == 1:
     transforms = transform_or_transforms[None]
   elif len(transform_or_transforms.get_shape()) == 2:
@@ -395,13 +402,11 @@ def _image_projective_transform_grad(op, grad):
   inverse = linalg_ops.matrix_inverse(transforms)
   transforms = matrices_to_flat_transforms(inverse)
   output = gen_image_ops.image_projective_transform(
-      grad, transforms, interpolation=interpolation)
-  if len(image_or_images.get_shape()) == 2:
-    return [output[0, :, :, 0], None]
-  elif len(image_or_images.get_shape()) == 3:
-    return [output[0, :, :, :], None]
-  else:
-    return [output, None]
+      images=grad,
+      transforms=transforms,
+      output_shape=image_or_images.get_shape()[1:3],
+      interpolation=interpolation)
+  return [output, None, None]
 
 
 def bipartite_match(distance_mat,
-- 
GitLab


From ab47eb8d9bcac55fd19b0e862cf9a2a7de195787 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 19 Apr 2018 13:38:43 -0700
Subject: [PATCH 171/450] tools/lib_package: Fix typo in README

PiperOrigin-RevId: 193566850
---
 tensorflow/tools/lib_package/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/tools/lib_package/README.md b/tensorflow/tools/lib_package/README.md
index 7008148260..cb6aef2624 100644
--- a/tensorflow/tools/lib_package/README.md
+++ b/tensorflow/tools/lib_package/README.md
@@ -35,8 +35,8 @@ The following commands:
 bazel test --config opt //tensorflow/tools/lib_package:libtensorflow_test
 bazel build --config opt \
   //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz \
-  //tensorflow/tools/lib_package:libtensorflow.jar \
-  //tensorflow/tools/lib_package:libtensorflow-src.jar
+  //tensorflow/java:libtensorflow.jar \
+  //tensorflow/java:libtensorflow-src.jar
 ```
 
 test and produce the following:
@@ -44,9 +44,9 @@ test and produce the following:
 -   The native library (`libtensorflow_jni.so`) packaged in an archive at:
     `bazel-bin/tensorflow/tools/lib_package/libtensorflow_jni.tar.gz`
 -   The Java archive at:
-    `bazel-bin/tensorflow/tools/lib_package/libtensorflow.jar`
+    `bazel-bin/tensorflow/java/libtensorflow.jar`
 -   The Java archive for Java sources at:
-    `bazel-bin/tensorflow/tools/lib_package/libtensorflow-src.jar`
+    `bazel-bin/tensorflow/java/libtensorflow-src.jar`
 
 ## Release
 
-- 
GitLab


From 1e7289fc0e64a706bb1867cfe5a8c5f5d2f7150f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 14:05:06 -0700
Subject: [PATCH 172/450] Make flat_transforms_to_matrices and
 matrices_to_flat_transforms publicly available.

PiperOrigin-RevId: 193571089
---
 tensorflow/contrib/image/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py
index e982030bc8..8f406ace1d 100755
--- a/tensorflow/contrib/image/__init__.py
+++ b/tensorflow/contrib/image/__init__.py
@@ -25,6 +25,8 @@ projective transforms (including rotation) are supported.
 @@angles_to_projective_transforms
 @@compose_transforms
 @@adjust_yiq_hsv
+@@flat_transforms_to_matrices
+@@matrices_to_flat_transforms
 @@random_yiq_hsv
 @@rotate
 @@transform
@@ -58,6 +60,8 @@ from tensorflow.contrib.image.python.ops.distort_image_ops import random_hsv_in_
 from tensorflow.contrib.image.python.ops.image_ops import angles_to_projective_transforms
 from tensorflow.contrib.image.python.ops.image_ops import compose_transforms
 from tensorflow.contrib.image.python.ops.image_ops import connected_components
+from tensorflow.contrib.image.python.ops.image_ops import flat_transforms_to_matrices
+from tensorflow.contrib.image.python.ops.image_ops import matrices_to_flat_transforms
 from tensorflow.contrib.image.python.ops.image_ops import rotate
 from tensorflow.contrib.image.python.ops.image_ops import transform
 from tensorflow.contrib.image.python.ops.image_ops import translate
-- 
GitLab


From ab5abfa42bdced7bf1c371e5e1224bdc1fafdcc1 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 19 Apr 2018 14:10:01 -0700
Subject: [PATCH 173/450] RecordReader: Simplify interface contract and
 implementation.

Prior to this change, RecordReader had the following contract:
- Records can be read in any order, EXCEPT if compression or buffering was
  enabled.
- If the underlying file is being concurrently written to
  then calls to ReadRecord() may fail (because of an incomplete
  record near the end of a file), but a retry may succeed (once the
  record is written), EXCEPT if compression or buffering is enabled
  (in which case the failure will be terminal).

  This "retry-may-succeed" behavior is relied upon by tensorboard
  (https://github.com/tensorflow/tensorboard/blob/1.7/tensorboard/backend/event_processing/event_file_loader.py#L55)
  where one process (typically the model training process) is writing
  tf.summary events to an event file and another process (tensorboard)
  is concurrently reading it.

With this change, the intent is to remove the EXCEPTions and have the
same behavior irrespective of compression/buffering.

Additionally, fix a memory leak when ZlibInputStream::Reset() is invoked.

PiperOrigin-RevId: 193571934
---
 tensorflow/core/lib/io/record_reader.cc    | 147 ++++----------
 tensorflow/core/lib/io/record_reader.h     |  16 +-
 tensorflow/core/lib/io/recordio_test.cc    | 212 ++++++++++++++-------
 tensorflow/core/lib/io/zlib_inputstream.cc |   9 +-
 tensorflow/core/lib/io/zlib_inputstream.h  |  10 +-
 5 files changed, 206 insertions(+), 188 deletions(-)

diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index 6de850bb20..c24628be57 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -56,110 +56,55 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions(
 
 RecordReader::RecordReader(RandomAccessFile* file,
                            const RecordReaderOptions& options)
-    : src_(file), options_(options) {
+    : options_(options),
+      input_stream_(new RandomAccessInputStream(file)),
+      last_read_failed_(false) {
   if (options.buffer_size > 0) {
-    input_stream_.reset(new BufferedInputStream(file, options.buffer_size));
-  } else {
-    input_stream_.reset(new RandomAccessInputStream(file));
+    input_stream_.reset(new BufferedInputStream(input_stream_.release(),
+                                                options.buffer_size, true));
   }
   if (options.compression_type == RecordReaderOptions::ZLIB_COMPRESSION) {
 // We don't have zlib available on all embedded platforms, so fail.
 #if defined(IS_SLIM_BUILD)
     LOG(FATAL) << "Zlib compression is unsupported on mobile platforms.";
 #else   // IS_SLIM_BUILD
-    zlib_input_stream_.reset(new ZlibInputStream(
-        input_stream_.get(), options.zlib_options.input_buffer_size,
-        options.zlib_options.output_buffer_size, options.zlib_options));
+    input_stream_.reset(new ZlibInputStream(
+        input_stream_.release(), options.zlib_options.input_buffer_size,
+        options.zlib_options.output_buffer_size, options.zlib_options, true));
 #endif  // IS_SLIM_BUILD
   } else if (options.compression_type == RecordReaderOptions::NONE) {
     // Nothing to do.
   } else {
-    LOG(FATAL) << "Unspecified compression type :" << options.compression_type;
+    LOG(FATAL) << "Unrecognized compression type :" << options.compression_type;
   }
 }
 
 // Read n+4 bytes from file, verify that checksum of first n bytes is
 // stored in the last 4 bytes and store the first n bytes in *result.
-// May use *storage as backing store.
-Status RecordReader::ReadChecksummed(uint64 offset, size_t n,
-                                     StringPiece* result, string* storage) {
+//
+// offset corresponds to the user-provided value to ReadRecord()
+// and is used only in error messages.
+Status RecordReader::ReadChecksummed(uint64 offset, size_t n, string* result) {
   if (n >= SIZE_MAX - sizeof(uint32)) {
     return errors::DataLoss("record size too large");
   }
 
   const size_t expected = n + sizeof(uint32);
-  storage->resize(expected);
-
-#if !defined(IS_SLIM_BUILD)
-  if (zlib_input_stream_) {
-    // If we have a zlib compressed buffer, we assume that the
-    // file is being read sequentially, and we use the underlying
-    // implementation to read the data.
-    //
-    // No checks are done to validate that the file is being read
-    // sequentially.  At some point the zlib input buffer may support
-    // seeking, possibly inefficiently.
-    TF_RETURN_IF_ERROR(zlib_input_stream_->ReadNBytes(expected, storage));
-
-    if (storage->size() != expected) {
-      if (storage->empty()) {
-        return errors::OutOfRange("eof");
-      } else {
-        return errors::DataLoss("truncated record at ", offset);
-      }
-    }
+  TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, result));
 
-    uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
-    if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
-      return errors::DataLoss("corrupted record at ", offset);
-    }
-    *result = StringPiece(storage->data(), n);
-  } else {
-#endif  // IS_SLIM_BUILD
-    if (options_.buffer_size > 0) {
-      // If we have a buffer, we assume that the file is being read
-      // sequentially, and we use the underlying implementation to read the
-      // data.
-      //
-      // No checks are done to validate that the file is being read
-      // sequentially.
-      TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, storage));
-
-      if (storage->size() != expected) {
-        if (storage->empty()) {
-          return errors::OutOfRange("eof");
-        } else {
-          return errors::DataLoss("truncated record at ", offset);
-        }
-      }
-
-      const uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
-      if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
-        return errors::DataLoss("corrupted record at ", offset);
-      }
-      *result = StringPiece(storage->data(), n);
+  if (result->size() != expected) {
+    if (result->empty()) {
+      return errors::OutOfRange("eof");
     } else {
-      // This version supports reading from arbitrary offsets
-      // since we are accessing the random access file directly.
-      StringPiece data;
-      TF_RETURN_IF_ERROR(src_->Read(offset, expected, &data, &(*storage)[0]));
-      if (data.size() != expected) {
-        if (data.empty()) {
-          return errors::OutOfRange("eof");
-        } else {
-          return errors::DataLoss("truncated record at ", offset);
-        }
-      }
-      const uint32 masked_crc = core::DecodeFixed32(data.data() + n);
-      if (crc32c::Unmask(masked_crc) != crc32c::Value(data.data(), n)) {
-        return errors::DataLoss("corrupted record at ", offset);
-      }
-      *result = StringPiece(data.data(), n);
+      return errors::DataLoss("truncated record at ", offset);
     }
-#if !defined(IS_SLIM_BUILD)
   }
-#endif  // IS_SLIM_BUILD
 
+  const uint32 masked_crc = core::DecodeFixed32(result->data() + n);
+  if (crc32c::Unmask(masked_crc) != crc32c::Value(result->data(), n)) {
+    return errors::DataLoss("corrupted record at ", offset);
+  }
+  result->resize(n);
   return Status::OK();
 }
 
@@ -167,50 +112,42 @@ Status RecordReader::ReadRecord(uint64* offset, string* record) {
   static const size_t kHeaderSize = sizeof(uint64) + sizeof(uint32);
   static const size_t kFooterSize = sizeof(uint32);
 
+  // Position the input stream.
+  int64 curr_pos = input_stream_->Tell();
+  int64 desired_pos = static_cast<int64>(*offset);
+  if (curr_pos > desired_pos || curr_pos < 0 /* EOF */ ||
+      (curr_pos == desired_pos && last_read_failed_)) {
+    last_read_failed_ = false;
+    TF_RETURN_IF_ERROR(input_stream_->Reset());
+    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos));
+  } else if (curr_pos < desired_pos) {
+    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos - curr_pos));
+  }
+  DCHECK_EQ(desired_pos, input_stream_->Tell());
+
   // Read header data.
-  StringPiece lbuf;
-  Status s = ReadChecksummed(*offset, sizeof(uint64), &lbuf, record);
+  Status s = ReadChecksummed(*offset, sizeof(uint64), record);
   if (!s.ok()) {
+    last_read_failed_ = true;
     return s;
   }
-  const uint64 length = core::DecodeFixed64(lbuf.data());
+  const uint64 length = core::DecodeFixed64(record->data());
 
   // Read data
-  StringPiece data;
-  s = ReadChecksummed(*offset + kHeaderSize, length, &data, record);
+  s = ReadChecksummed(*offset + kHeaderSize, length, record);
   if (!s.ok()) {
+    last_read_failed_ = true;
     if (errors::IsOutOfRange(s)) {
       s = errors::DataLoss("truncated record at ", *offset);
     }
     return s;
   }
 
-  if (record->data() != data.data()) {
-    // RandomAccessFile placed the data in some other location.
-    memmove(&(*record)[0], data.data(), data.size());
-  }
-
-  record->resize(data.size());
-
   *offset += kHeaderSize + length + kFooterSize;
+  DCHECK_EQ(*offset, input_stream_->Tell());
   return Status::OK();
 }
 
-Status RecordReader::SkipNBytes(uint64 offset) {
-#if !defined(IS_SLIM_BUILD)
-  if (zlib_input_stream_) {
-    TF_RETURN_IF_ERROR(zlib_input_stream_->SkipNBytes(offset));
-  } else {
-#endif
-    if (options_.buffer_size > 0) {
-      TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(offset));
-    }
-#if !defined(IS_SLIM_BUILD)
-  }
-#endif
-  return Status::OK();
-}  // namespace io
-
 SequentialRecordReader::SequentialRecordReader(
     RandomAccessFile* file, const RecordReaderOptions& options)
     : underlying_(file, options), offset_(0) {}
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index 26278e0328..f6d587dfa0 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -69,25 +69,14 @@ class RecordReader {
   // Read the record at "*offset" into *record and update *offset to
   // point to the offset of the next record.  Returns OK on success,
   // OUT_OF_RANGE for end of file, or something else for an error.
-  //
-  // Note: if buffering is used (with or without compression), access must be
-  // sequential.
   Status ReadRecord(uint64* offset, string* record);
 
-  // Skip the records till "offset". Returns OK on success,
-  // OUT_OF_RANGE for end of file, or something else for an error.
-  Status SkipNBytes(uint64 offset);
-
  private:
-  Status ReadChecksummed(uint64 offset, size_t n, StringPiece* result,
-                         string* storage);
+  Status ReadChecksummed(uint64 offset, size_t n, string* result);
 
-  RandomAccessFile* src_;
   RecordReaderOptions options_;
   std::unique_ptr<InputStreamInterface> input_stream_;
-#if !defined(IS_SLIM_BUILD)
-  std::unique_ptr<ZlibInputStream> zlib_input_stream_;
-#endif  // IS_SLIM_BUILD
+  bool last_read_failed_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(RecordReader);
 };
@@ -121,7 +110,6 @@ class SequentialRecordReader {
       return errors::InvalidArgument(
           "Trying to seek offset: ", offset,
           " which is less than the current offset: ", offset_);
-    TF_RETURN_IF_ERROR(underlying_.SkipNBytes(offset - offset_));
     offset_ = offset;
     return Status::OK();
   }
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index 63235761d9..da514bd21c 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -26,10 +26,11 @@ limitations under the License.
 
 namespace tensorflow {
 namespace io {
+namespace {
 
 // Construct a string of the specified length made out of the supplied
 // partial string.
-static string BigString(const string& partial_string, size_t n) {
+string BigString(const string& partial_string, size_t n) {
   string result;
   while (result.size() < n) {
     result.append(partial_string);
@@ -39,62 +40,66 @@ static string BigString(const string& partial_string, size_t n) {
 }
 
 // Construct a string from a number
-static string NumberString(int n) {
+string NumberString(int n) {
   char buf[50];
   snprintf(buf, sizeof(buf), "%d.", n);
   return string(buf);
 }
 
 // Return a skewed potentially long string
-static string RandomSkewedString(int i, random::SimplePhilox* rnd) {
+string RandomSkewedString(int i, random::SimplePhilox* rnd) {
   return BigString(NumberString(i), rnd->Skewed(17));
 }
 
-class RecordioTest : public ::testing::Test {
+class StringDest : public WritableFile {
+ public:
+  explicit StringDest(string* contents) : contents_(contents) {}
+
+  Status Close() override { return Status::OK(); }
+  Status Flush() override { return Status::OK(); }
+  Status Sync() override { return Status::OK(); }
+  Status Append(const StringPiece& slice) override {
+    contents_->append(slice.data(), slice.size());
+    return Status::OK();
+  }
+
  private:
-  class StringDest : public WritableFile {
-   public:
-    string contents_;
-
-    Status Close() override { return Status::OK(); }
-    Status Flush() override { return Status::OK(); }
-    Status Sync() override { return Status::OK(); }
-    Status Append(const StringPiece& slice) override {
-      contents_.append(slice.data(), slice.size());
-      return Status::OK();
+  string* contents_;
+};
+
+class StringSource : public RandomAccessFile {
+ public:
+  explicit StringSource(string* contents)
+      : contents_(contents), force_error_(false) {}
+
+  Status Read(uint64 offset, size_t n, StringPiece* result,
+              char* scratch) const override {
+    if (force_error_) {
+      force_error_ = false;
+      return errors::DataLoss("read error");
     }
-  };
-
-  class StringSource : public RandomAccessFile {
-   public:
-    StringPiece contents_;
-    mutable bool force_error_;
-    mutable bool returned_partial_;
-    StringSource() : force_error_(false), returned_partial_(false) {}
-
-    Status Read(uint64 offset, size_t n, StringPiece* result,
-                char* scratch) const override {
-      EXPECT_FALSE(returned_partial_) << "must not Read() after eof/error";
-
-      if (force_error_) {
-        force_error_ = false;
-        returned_partial_ = true;
-        return errors::DataLoss("read error");
-      }
-
-      if (offset >= contents_.size()) {
-        return errors::OutOfRange("end of file");
-      }
-
-      if (contents_.size() < offset + n) {
-        n = contents_.size() - offset;
-        returned_partial_ = true;
-      }
-      *result = StringPiece(contents_.data() + offset, n);
-      return Status::OK();
+
+    if (offset >= contents_->size()) {
+      return errors::OutOfRange("end of file");
+    }
+
+    if (contents_->size() < offset + n) {
+      n = contents_->size() - offset;
     }
-  };
+    *result = StringPiece(contents_->data() + offset, n);
+    return Status::OK();
+  }
+
+  void force_error() { force_error_ = true; }
+
+ private:
+  string* contents_;
+  mutable bool force_error_;
+};
 
+class RecordioTest : public ::testing::Test {
+ private:
+  string contents_;
   StringDest dest_;
   StringSource source_;
   bool reading_;
@@ -104,7 +109,9 @@ class RecordioTest : public ::testing::Test {
 
  public:
   RecordioTest()
-      : reading_(false),
+      : dest_(&contents_),
+        source_(&contents_),
+        reading_(false),
         readpos_(0),
         writer_(new RecordWriter(&dest_)),
         reader_(new RecordReader(&source_)) {}
@@ -119,12 +126,11 @@ class RecordioTest : public ::testing::Test {
     TF_ASSERT_OK(writer_->WriteRecord(StringPiece(msg)));
   }
 
-  size_t WrittenBytes() const { return dest_.contents_.size(); }
+  size_t WrittenBytes() const { return contents_.size(); }
 
   string Read() {
     if (!reading_) {
       reading_ = true;
-      source_.contents_ = StringPiece(dest_.contents_);
     }
     string record;
     Status s = reader_->ReadRecord(&readpos_, &record);
@@ -137,26 +143,20 @@ class RecordioTest : public ::testing::Test {
     }
   }
 
-  void IncrementByte(int offset, int delta) {
-    dest_.contents_[offset] += delta;
-  }
+  void IncrementByte(int offset, int delta) { contents_[offset] += delta; }
 
-  void SetByte(int offset, char new_byte) {
-    dest_.contents_[offset] = new_byte;
-  }
+  void SetByte(int offset, char new_byte) { contents_[offset] = new_byte; }
 
-  void ShrinkSize(int bytes) {
-    dest_.contents_.resize(dest_.contents_.size() - bytes);
-  }
+  void ShrinkSize(int bytes) { contents_.resize(contents_.size() - bytes); }
 
   void FixChecksum(int header_offset, int len) {
     // Compute crc of type/len/data
-    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset + 6], 1 + len);
+    uint32_t crc = crc32c::Value(&contents_[header_offset + 6], 1 + len);
     crc = crc32c::Mask(crc);
-    core::EncodeFixed32(&dest_.contents_[header_offset], crc);
+    core::EncodeFixed32(&contents_[header_offset], crc);
   }
 
-  void ForceError() { source_.force_error_ = true; }
+  void ForceError() { source_.force_error(); }
 
   void StartReadingAt(uint64_t initial_offset) { readpos_ = initial_offset; }
 
@@ -165,7 +165,6 @@ class RecordioTest : public ::testing::Test {
     Write("bar");
     Write(BigString("x", 10000));
     reading_ = true;
-    source_.contents_ = StringPiece(dest_.contents_);
     uint64 offset = WrittenBytes() + offset_past_end;
     string record;
     Status s = reader_->ReadRecord(&offset, &record);
@@ -217,16 +216,100 @@ TEST_F(RecordioTest, RandomRead) {
   ASSERT_EQ("EOF", Read());
 }
 
+void TestNonSequentialReads(const RecordWriterOptions& writer_options,
+                            const RecordReaderOptions& reader_options) {
+  string contents;
+  StringDest dst(&contents);
+  RecordWriter writer(&dst, writer_options);
+  for (int i = 0; i < 10; ++i) {
+    TF_ASSERT_OK(writer.WriteRecord(NumberString(i))) << i;
+  }
+  TF_ASSERT_OK(writer.Close());
+
+  StringSource file(&contents);
+  RecordReader reader(&file, reader_options);
+
+  string record;
+  // First read sequentially to fill in the offsets table.
+  uint64 offsets[10] = {0};
+  uint64 offset = 0;
+  for (int i = 0; i < 10; ++i) {
+    offsets[i] = offset;
+    TF_ASSERT_OK(reader.ReadRecord(&offset, &record)) << i;
+  }
+
+  // Read randomly: First go back to record #3 then forward to #8.
+  offset = offsets[3];
+  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
+  EXPECT_EQ("3.", record);
+  EXPECT_EQ(offsets[4], offset);
+
+  offset = offsets[8];
+  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
+  EXPECT_EQ("8.", record);
+  EXPECT_EQ(offsets[9], offset);
+}
+
+TEST_F(RecordioTest, NonSequentialReads) {
+  TestNonSequentialReads(RecordWriterOptions(), RecordReaderOptions());
+}
+
+TEST_F(RecordioTest, NonSequentialReadsWithReadBuffer) {
+  RecordReaderOptions options;
+  options.buffer_size = 1 << 10;
+  TestNonSequentialReads(RecordWriterOptions(), options);
+}
+
+TEST_F(RecordioTest, NonSequentialReadsWithCompression) {
+  TestNonSequentialReads(
+      RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
+      RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
+}
+
 // Tests of all the error paths in log_reader.cc follow:
-static void AssertHasSubstr(StringPiece s, StringPiece expected) {
+void AssertHasSubstr(StringPiece s, StringPiece expected) {
   EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
+void TestReadError(const RecordWriterOptions& writer_options,
+                   const RecordReaderOptions& reader_options) {
+  const string wrote = BigString("well hello there!", 100);
+  string contents;
+  StringDest dst(&contents);
+  TF_ASSERT_OK(RecordWriter(&dst, writer_options).WriteRecord(wrote));
+
+  StringSource file(&contents);
+  RecordReader reader(&file, reader_options);
+
+  uint64 offset = 0;
+  string read;
+  file.force_error();
+  Status status = reader.ReadRecord(&offset, &read);
+  ASSERT_TRUE(errors::IsDataLoss(status));
+  ASSERT_EQ(0, offset);
+
+  // A failed Read() shouldn't update the offset, and thus a retry shouldn't
+  // lose the record.
+  status = reader.ReadRecord(&offset, &read);
+  ASSERT_TRUE(status.ok()) << status;
+  EXPECT_GT(offset, 0);
+  EXPECT_EQ(wrote, read);
+}
+
 TEST_F(RecordioTest, ReadError) {
-  Write("foo");
-  ForceError();
-  AssertHasSubstr(Read(), "Data loss");
+  TestReadError(RecordWriterOptions(), RecordReaderOptions());
+}
+
+TEST_F(RecordioTest, ReadErrorWithBuffering) {
+  RecordReaderOptions options;
+  options.buffer_size = 1 << 20;
+  TestReadError(RecordWriterOptions(), options);
+}
+
+TEST_F(RecordioTest, ReadErrorWithCompression) {
+  TestReadError(RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
+                RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
 }
 
 TEST_F(RecordioTest, CorruptLength) {
@@ -257,5 +340,6 @@ TEST_F(RecordioTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
 
 TEST_F(RecordioTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
 
+}  // namespace
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc
index 984fbc2810..bf8dcf0988 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.cc
+++ b/tensorflow/core/lib/io/zlib_inputstream.cc
@@ -25,8 +25,9 @@ ZlibInputStream::ZlibInputStream(
     InputStreamInterface* input_stream,
     size_t input_buffer_bytes,   // size of z_stream.next_in buffer
     size_t output_buffer_bytes,  // size of z_stream.next_out buffer
-    const ZlibCompressionOptions& zlib_options)
-    : input_stream_(input_stream),
+    const ZlibCompressionOptions& zlib_options, bool owns_input_stream)
+    : owns_input_stream_(owns_input_stream),
+      input_stream_(input_stream),
       input_buffer_capacity_(input_buffer_bytes),
       output_buffer_capacity_(output_buffer_bytes),
       z_stream_input_(new Bytef[input_buffer_capacity_]),
@@ -41,10 +42,14 @@ ZlibInputStream::~ZlibInputStream() {
   if (z_stream_) {
     inflateEnd(z_stream_.get());
   }
+  if (owns_input_stream_) {
+    delete input_stream_;
+  }
 }
 
 Status ZlibInputStream::Reset() {
   TF_RETURN_IF_ERROR(input_stream_->Reset());
+  inflateEnd(z_stream_.get());
   InitZlibBuffer();
   bytes_read_ = 0;
   return Status::OK();
diff --git a/tensorflow/core/lib/io/zlib_inputstream.h b/tensorflow/core/lib/io/zlib_inputstream.h
index 9c7e14441c..6099e2455d 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.h
+++ b/tensorflow/core/lib/io/zlib_inputstream.h
@@ -40,10 +40,13 @@ class ZlibInputStream : public InputStreamInterface {
   // Create a ZlibInputStream for `input_stream` with a buffer of size
   // `input_buffer_bytes` bytes for reading contents from `input_stream` and
   // another buffer with size `output_buffer_bytes` for caching decompressed
-  // contents. Does *not* take ownership of "input_stream".
+  // contents.
+  //
+  // Takes ownership of `input_stream` iff `owns_input_stream` is true.
   ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes,
                   size_t output_buffer_bytes,
-                  const ZlibCompressionOptions& zlib_options);
+                  const ZlibCompressionOptions& zlib_options,
+                  bool owns_input_stream = false);
 
   ~ZlibInputStream();
 
@@ -65,7 +68,8 @@ class ZlibInputStream : public InputStreamInterface {
  private:
   void InitZlibBuffer();
 
-  InputStreamInterface* input_stream_;  // Not owned
+  const bool owns_input_stream_;
+  InputStreamInterface* input_stream_;
   size_t input_buffer_capacity_;        // Size of z_stream_input_
   size_t output_buffer_capacity_;       // Size of z_stream_output_
   char* next_unread_byte_;              // Next unread byte in z_stream_output_
-- 
GitLab


From a4945fc86cabcf3d5f0b9eaac21bb7c1d1146d57 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Thu, 19 Apr 2018 14:30:27 -0700
Subject: [PATCH 174/450] The HLO element type converter must remove side
 effecting instructions like Rng

The CPU backend does not know how to lower bf16 typed RNG nodes so even unused
instances of these can't remain in the HLO IR.
HloComputation::ReplaceInstruction keeps these Rng nodes around since it doesn't
remove side effecting nodes.
PiperOrigin-RevId: 193575183
---
 .../xla/service/hlo_element_type_converter.cc | 15 ++++-
 .../hlo_element_type_converter_test.cc        | 66 +++++++++++++++++++
 .../compiler/xla/service/hlo_instruction.cc   | 37 ++++++++---
 .../compiler/xla/service/hlo_instruction.h    | 28 +++++---
 tensorflow/compiler/xla/util.h                | 10 +++
 5 files changed, 139 insertions(+), 17 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index c782d1b0ad..d236f83aeb 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -178,24 +178,37 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
       if (hlo->shape().element_type() == eliminate_type_) {
         Shape shape =
             ShapeUtil::ChangeElementType(hlo->shape(), replace_with_type_);
+
         new_hlo = computation->AddInstruction(
             hlo->CloneWithNewOperands(shape, new_operands, hlo->GetModule()));
+        TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
+
         new_hlo = ToElementType(new_hlo, eliminate_type_);
       } else if (ShapeUtil::IsTuple(hlo->shape())) {
         Shape old_shape = hlo->shape();
         Shape new_shape = GetConvertedTupleShape(hlo->shape(), eliminate_type_,
                                                  replace_with_type_);
+
         new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
             new_shape, new_operands, hlo->GetModule()));
+        TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
+
         // Convert the elements of the result of `new_hlo` to produce a new
         // tuple with shape `old_shape`.
         new_hlo = ConvertTupleElements(new_hlo, old_shape);
       } else {
         new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
             hlo->shape(), new_operands, hlo->GetModule()));
+        TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
       }
 
-      TF_RETURN_IF_ERROR(computation->ReplaceInstruction(hlo, new_hlo));
+      TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo));
+      TF_RETURN_IF_ERROR(hlo->DropAllControlDeps());
+
+      // NB!  We want to replace and remove side effecting instructions like Rng
+      // as well so we can't rely on HloComputation::ReplaceInstruction to
+      // reliably remove the replaced instruction.
+      TF_RETURN_IF_ERROR(computation->RemoveInstruction(hlo));
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
index cb94d9f19b..5c5a059e0f 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
@@ -22,6 +22,12 @@ namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
+using ::testing::Contains;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Not;
+using ::testing::ResultOf;
+
 class HloElementTypeConverterTest : public HloTestBase {
  public:
   std::unique_ptr<HloModule> CreateModuleFromHloString(
@@ -117,5 +123,65 @@ TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) {
                         op::Convert(op::GetTupleElement(batch_norm, 2))));
 }
 
+TEST_F(HloElementTypeConverterTest, RngIsRemoved) {
+  const string& hlo_string = R"(
+HloModule RngIsRemoved
+
+ENTRY main {
+  constant.3 = bf16[] constant(0)
+  constant.4 = bf16[] constant(1)
+  ROOT rng = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform
+}
+  )";
+  auto module = CreateModuleFromHloString(hlo_string);
+  HloElementTypeConverter type_converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_TRUE(converted);
+
+  std::function<bool(const HloInstruction*)> is_bf16_rng =
+      [](const HloInstruction* inst) {
+        return inst->shape().element_type() == BF16 &&
+               inst->opcode() == HloOpcode::kRng;
+      };
+
+  EXPECT_THAT(module->entry_computation()->instructions(),
+              Not(Contains(ResultOf(is_bf16_rng, Eq(true)))));
+}
+
+TEST_F(HloElementTypeConverterTest, RngCtrlDep) {
+  const string& hlo_string = R"(
+HloModule RngIsRemoved
+
+ENTRY main {
+  constant.3 = bf16[] constant(0)
+  constant.4 = bf16[] constant(1)
+  rng0 = bf16[1,2000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform
+  ROOT rng1 = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), control-predecessors={%rng0}, distribution=rng_uniform
+}
+  )";
+  auto module = CreateModuleFromHloString(hlo_string);
+
+  HloElementTypeConverter type_converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_TRUE(converted);
+  // Locate the two Rng instructions left after conversion by their shapes.
+  HloInstruction *rng0 = nullptr, *rng1 = nullptr;
+  for (auto* inst : module->entry_computation()->instructions()) {
+    if (inst->opcode() == HloOpcode::kRng) {
+      const Shape& shape = inst->shape();
+      ASSERT_EQ(shape.dimensions_size(), 3);
+      ASSERT_TRUE(shape.dimensions(1) == 2000 || shape.dimensions(1) == 1000);
+      if (shape.dimensions(1) == 2000) {
+        rng0 = inst;
+      } else {
+        rng1 = inst;
+      }
+    }
+  }
+  ASSERT_TRUE(rng0 != nullptr && rng1 != nullptr);  // fail, don't deref junk
+  EXPECT_THAT(rng0->control_successors(), ElementsAre(rng1));
+  EXPECT_THAT(rng1->control_predecessors(), ElementsAre(rng0));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 6303bcc59f..a638d54d85 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -1678,14 +1678,35 @@ Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) {
 }
 
 Status HloInstruction::RemoveControlDependencyTo(HloInstruction* instruction) {
-  auto succ_it = std::find(control_successors_.begin(),
-                           control_successors_.end(), instruction);
-  TF_RET_CHECK(succ_it != control_successors_.end());
-  control_successors_.erase(succ_it);
-  auto pred_it = std::find(instruction->control_predecessors_.begin(),
-                           instruction->control_predecessors_.end(), this);
-  TF_RET_CHECK(pred_it != instruction->control_predecessors_.end());
-  instruction->control_predecessors_.erase(pred_it);
+  TF_RET_CHECK(instruction->parent() == parent());
+  TF_RETURN_IF_ERROR(EraseElementFromVector(&control_successors_, instruction));
+  TF_RETURN_IF_ERROR(
+      EraseElementFromVector(&instruction->control_predecessors_, this));
+  return Status::OK();
+}
+
+Status HloInstruction::DropAllControlDeps() {
+  for (auto* ctrl_succ : control_successors_) {
+    TF_RETURN_IF_ERROR(
+        EraseElementFromVector(&ctrl_succ->control_predecessors_, this));
+  }
+  for (auto* ctrl_pred : control_predecessors_) {
+    TF_RETURN_IF_ERROR(
+        EraseElementFromVector(&ctrl_pred->control_successors_, this));
+  }
+  control_successors_.clear();
+  control_predecessors_.clear();
+  return Status::OK();
+}
+
+Status HloInstruction::CopyAllControlDepsFrom(const HloInstruction* inst) {
+  for (auto* ctrl_pred : inst->control_predecessors()) {
+    TF_RETURN_IF_ERROR(ctrl_pred->AddControlDependencyTo(this));
+  }
+
+  for (auto* ctrl_succ : inst->control_successors()) {
+    TF_RETURN_IF_ERROR(this->AddControlDependencyTo(ctrl_succ));
+  }
 
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 5a7394f7a6..a5e9aecb9e 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -557,6 +557,18 @@ class HloInstruction {
   // 'instruction'.
   Status RemoveControlDependencyTo(HloInstruction* instruction);
 
+  // Drops all control predecessors and successors from this HLO instruction.
+  Status DropAllControlDeps();
+
+  // Copies the control predecessors and successors of `inst` onto this HLO
+  // instruction.  Does not do a deep copy so this makes sense only if `inst`
+  // and this HLO are in the same module.
+  //
+  // Depending on the use cases we see in practice, in the future we may
+  // consider folding the logic here into Clone, CloneWithNewOperands and
+  // ReplaceAllUsesWith by treating control dependencies like data dependencies.
+  Status CopyAllControlDepsFrom(const HloInstruction* inst);
+
   // Returns the set of control predecessors (successors) of this
   // instruction. Control predecessors (successors) must execute before (after)
   // the current instruction.
@@ -1148,17 +1160,17 @@ class HloInstruction {
   // Clones the HLO instruction. The clone will have the same opcode, shape, and
   // operands. After creation the clone has no uses. "this" (the instruction
   // cloned from) is not changed. Suffix is the string to append to the name of
-  // the instruction to form the name of the cloned instruction.
-  // If the module pointer is not nullptr, it will be the module where
-  // the cloned computations will be added to (in order to support deep
-  // cloning).
+  // the instruction to form the name of the cloned instruction.  If the module
+  // pointer is not nullptr, it will be the module where the cloned computations
+  // will be added to (in order to support deep cloning).  Ignores the control
+  // predecessors and successors of this HLO instruction.
   std::unique_ptr<HloInstruction> Clone(const string& suffix = "clone",
                                         HloModule* module = nullptr) const;
 
-  // Clones the HLO instruction as above but with new shape and operands.
-  // If the module pointer is not nullptr, it will be the module where
-  // the cloned computations will be added to (in order to support deep
-  // cloning).
+  // Clones the HLO instruction as above but with new shape and operands.  If
+  // the module pointer is not nullptr, it will be the module where the cloned
+  // computations will be added to (in order to support deep cloning).  Ignores
+  // the control predecessors and successors of this HLO instruction.
   std::unique_ptr<HloInstruction> CloneWithNewOperands(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
       HloModule* module = nullptr) const;
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 2da9f9ed6f..be33bd6dd1 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -528,6 +528,16 @@ bool IsInt32(T x) {
   // value is implementation-defined."
   return static_cast<int32>(x) == x;
 }
+
+template <typename T>
+Status EraseElementFromVector(std::vector<T>* container, const T& value) {
+  // c_find returns a const_iterator which does not seem to work on gcc 4.8.4,
+  // and this breaks the ubuntu/xla_gpu build bot.
+  auto it = std::find(container->begin(), container->end(), value);
+  TF_RET_CHECK(it != container->end());
+  container->erase(it);
+  return Status::OK();
+}
 }  // namespace xla
 
 #define XLA_LOG_LINES(SEV, STRING) \
-- 
GitLab


From 1aa032b94f630845abf6c3dce8d6623ae9e35b0f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 14:35:27 -0700
Subject: [PATCH 175/450] Replaced calls to deprecated tensorflow::StringPiece
 methods with their tensorflow::str_util equivalents.

This will allow the deprecated methods to be removed.

PiperOrigin-RevId: 193575992
---
 tensorflow/core/platform/test_main.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/platform/test_main.cc b/tensorflow/core/platform/test_main.cc
index 677114f5f2..e57bbd80af 100644
--- a/tensorflow/core/platform/test_main.cc
+++ b/tensorflow/core/platform/test_main.cc
@@ -26,7 +26,7 @@ limitations under the License.
 
 #include <iostream>
 
-#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/stacktrace_handler.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -37,7 +37,7 @@ GTEST_API_ int main(int argc, char** argv) {
   tensorflow::testing::InstallStacktraceHandler();
   testing::InitGoogleTest(&argc, argv);
   for (int i = 1; i < argc; i++) {
-    if (tensorflow::StringPiece(argv[i]).starts_with("--benchmarks=")) {
+    if (tensorflow::str_util::StartsWith(argv[i], "--benchmarks=")) {
       const char* pattern = argv[i] + strlen("--benchmarks=");
       tensorflow::testing::Benchmark::Run(pattern);
       return 0;
-- 
GitLab


From 470842748b9ee219fa0fcb8e3de25720960c83e3 Mon Sep 17 00:00:00 2001
From: Olivia Nordquist <nolivia@google.com>
Date: Thu, 19 Apr 2018 14:59:25 -0700
Subject: [PATCH 176/450] disabling opensource testing for failing xla test

PiperOrigin-RevId: 193579805
---
 tensorflow/compiler/xla/python/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index 0517a5502e..0b9333b406 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -8,6 +8,7 @@ py_library(
     name = "xla_client",
     srcs = ["xla_client.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_oss"],
     visibility = ["//visibility:public"],
     deps = [
         ":pywrap_xla",
-- 
GitLab


From 2d0a7087a14f015ea49f4b8feb70e0b5ecd41b28 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 15:09:58 -0700
Subject: [PATCH 177/450] Only generate floating points that are fractions like
 n / 256, since they are RGB pixels. This fixes RGBToHSVTest.testBatch on
 low-precision dtypes like bfloat16.

PiperOrigin-RevId: 193581652
---
 tensorflow/compiler/tests/image_ops_test.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 5b19e993ec..42e637734c 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -34,20 +34,23 @@ from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
 
 
+def GenerateNumpyRandomRGB(shape):
+  # Only generate floating points that are fractions like n / 256, since they
+  # are RGB pixels. Some low-precision floating point types in this test can't
+  # handle arbitrary precision floating points well.
+  return np.random.randint(0, 256, shape) / 256.
+
+
 class RGBToHSVTest(XLATestCase):
 
   def testBatch(self):
-    # TODO(b/78230407): Reenable the test on GPU.
-    if self.device == "XLA_GPU":
-      return
-
     # Build an arbitrary RGB image
     np.random.seed(7)
     batch_size = 5
     shape = (batch_size, 2, 7, 3)
 
     for nptype in self.float_types:
-      inp = np.random.rand(*shape).astype(nptype)
+      inp = GenerateNumpyRandomRGB(shape).astype(nptype)
 
       # Convert to HSV and back, as a batch and individually
       with self.test_session() as sess:
@@ -87,7 +90,7 @@ class RGBToHSVTest(XLATestCase):
   def testRGBToHSVNumpy(self):
     """Tests the RGB to HSV conversion matches a reference implementation."""
     for nptype in self.float_types:
-      rgb_flat = np.random.random(64 * 3).reshape((64, 3)).astype(nptype)
+      rgb_flat = GenerateNumpyRandomRGB((64, 3)).astype(nptype)
       rgb_np = rgb_flat.reshape(4, 4, 4, 3)
       hsv_np = np.array([
           colorsys.rgb_to_hsv(
-- 
GitLab


From 38c0d7e1c0ee0617cf73ccf6809bd55d70089233 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 15:27:19 -0700
Subject: [PATCH 178/450] Convert a local variable and mutex to a struct so
 GUARDED_BY annotation works correctly.

PiperOrigin-RevId: 193584438
---
 tensorflow/core/kernels/sdca_ops.cc | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc
index 55e68b348b..05c835ebc4 100644
--- a/tensorflow/core/kernels/sdca_ops.cc
+++ b/tensorflow/core/kernels/sdca_ops.cc
@@ -156,8 +156,10 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
   } else {
     examples.RandomShuffle();
   }
-  mutex mu;
-  Status train_step_status GUARDED_BY(mu);
+  struct {
+    mutex mu;
+    Status value GUARDED_BY(mu);
+  } train_step_status;
   std::atomic<std::int64_t> atomic_index(-1);
   auto train_step = [&](const int64 begin, const int64 end) {
     // The static_cast here is safe since begin and end can be at most
@@ -171,8 +173,8 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
       const Status conversion_status =
           options.loss_updater->ConvertLabel(&example_label);
       if (!conversion_status.ok()) {
-        mutex_lock l(mu);
-        train_step_status = conversion_status;
+        mutex_lock l(train_step_status.mu);
+        train_step_status.value = conversion_status;
         // Return from this worker thread - the calling thread is
         // responsible for checking context status and returning on error.
         return;
@@ -217,7 +219,8 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
 
   Shard(worker_threads.num_threads, worker_threads.workers,
         examples.num_examples(), kCostPerUnit, train_step);
-  OP_REQUIRES_OK(context, train_step_status);
+  mutex_lock l(train_step_status.mu);
+  OP_REQUIRES_OK(context, train_step_status.value);
 }
 
 }  // namespace
-- 
GitLab


From 4bcf49c4b22205fc829f89da96e37f366c9fa9e6 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 19 Apr 2018 15:29:21 -0700
Subject: [PATCH 179/450] Prevent a bool field from being accessed when
 uninitialized.

PiperOrigin-RevId: 193584746
---
 tensorflow/core/distributed_runtime/message_wrappers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 92c5668e3a..72a0c7edd8 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -353,7 +353,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
 
  private:
   string session_handle_;
-  bool create_worker_session_called_;
+  bool create_worker_session_called_ = false;
   string graph_handle_;
   int64 step_id_;
   ExecutorOpts exec_opts_;
-- 
GitLab


From 4868ddd508a567a497935378956e9da18976f152 Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Thu, 19 Apr 2018 15:32:37 -0700
Subject: [PATCH 180/450] Simplifying cols_to_vars update

PiperOrigin-RevId: 193585237
---
 tensorflow/python/feature_column/feature_column.py      | 6 ++----
 tensorflow/python/feature_column/feature_column_test.py | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 87a52f8441..a7c4eabcb2 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -417,10 +417,8 @@ def linear_model(features,
       trainable=trainable,
       name='linear_model')
   retval = linear_model_layer(features)  # pylint: disable=not-callable
-  if cols_to_vars is None:
-    return retval
-  for k, v in linear_model_layer.cols_to_vars().items():
-    cols_to_vars[k] = v
+  if cols_to_vars is not None:
+    cols_to_vars.update(linear_model_layer.cols_to_vars())
   return retval
 
 
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 49e06b8245..d963dd9b55 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -1269,10 +1269,8 @@ def get_keras_linear_model_predictions(features,
       trainable,
       name='linear_model')
   retval = keras_linear_model(features)  # pylint: disable=not-callable
-  if cols_to_vars is None:
-    return retval
-  for k, v in keras_linear_model.cols_to_vars().items():
-    cols_to_vars[k] = v
+  if cols_to_vars is not None:
+    cols_to_vars.update(keras_linear_model.cols_to_vars())
   return retval
 
 
-- 
GitLab


From d5c32f4ccc85ad0d13f3a1f83e063211504cf976 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 19 Apr 2018 15:55:53 -0700
Subject: [PATCH 181/450] Internal-only change.

PiperOrigin-RevId: 193588868
---
 tensorflow/contrib/data/python/kernel_tests/BUILD | 1 +
 tensorflow/contrib/estimator/BUILD                | 1 +
 tensorflow/contrib/learn/BUILD                    | 5 ++++-
 tensorflow/python/kernel_tests/BUILD              | 3 +++
 tensorflow/python/kernel_tests/linalg/BUILD       | 5 ++++-
 5 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 83daa04efc..05a4f5028a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -216,6 +216,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
+        "noasan",  # times out
         "optonly",
     ],
     deps = [
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 9e88bc7de1..62ddb3d290 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -447,6 +447,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
+        "noasan",  # times out
         "notsan",
     ],
     deps = [
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index d665fc9335..3b053cd4c6 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -281,7 +281,10 @@ py_test(
     size = "medium",
     srcs = ["python/learn/estimators/estimator_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "noasan",  # times out
+    ],
     deps = [
         ":learn",
         "//tensorflow/contrib/framework:framework_py",
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 9440f2a4f9..8628ca5d40 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1190,6 +1190,9 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 10,
+    tags = [
+        "noasan",  # times out
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index 4e3f24890b..7ffa48b653 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -123,7 +123,10 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 5,
-    tags = ["optonly"],
+    tags = [
+        "noasan",  # times out
+        "optonly",
+    ],
 )
 
 cuda_py_test(
-- 
GitLab


From 9e5fdb83e609701457f6fdc2d153b1f7e83ead6c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 15:56:17 -0700
Subject: [PATCH 182/450] Automated g4 rollback of changelist 193564222

PiperOrigin-RevId: 193588935
---
 tensorflow/contrib/image/kernels/image_ops.cc |  7 +--
 tensorflow/contrib/image/kernels/image_ops.h  |  2 +-
 tensorflow/contrib/image/ops/image_ops.cc     | 52 ++-----------------
 .../python/kernel_tests/image_ops_test.py     | 30 -----------
 .../contrib/image/python/ops/image_ops.py     | 39 ++++++--------
 5 files changed, 23 insertions(+), 107 deletions(-)

diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index ae4b1ba62a..c2e32da133 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -70,7 +70,6 @@ class ImageProjectiveTransform : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& images_t = ctx->input(0);
     const Tensor& transform_t = ctx->input(1);
-    const Tensor& output_dim = ctx->input(2);
     OP_REQUIRES(ctx, images_t.shape().dims() == 4,
                 errors::InvalidArgument("Input images must have rank 4"));
     OP_REQUIRES(ctx,
@@ -84,11 +83,7 @@ class ImageProjectiveTransform : public OpKernel {
     auto images = images_t.tensor<T, 4>();
     auto transform = transform_t.matrix<float>();
     Tensor* output_t;
-    // Image is NHWC format.
-    auto output_shape = images_t.shape();
-    output_shape.set_dim(1, output_dim.vec<int>()(0));
-    output_shape.set_dim(2, output_dim.vec<int>()(1));
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t));
     auto output = output_t->tensor<T, 4>();
     (FillProjectiveTransform<Device, T>(interpolation_))(
         ctx->eigen_device<Device>(), &output, images, transform);
diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h
index 2320329b92..ad50133061 100644
--- a/tensorflow/contrib/image/kernels/image_ops.h
+++ b/tensorflow/contrib/image/kernels/image_ops.h
@@ -161,7 +161,7 @@ struct FillProjectiveTransform {
   void operator()(const Device& device, OutputType* output,
                   const InputType& images,
                   const TransformsType& transform) const {
-    output->device(device) = output->generate(
+    output->device(device) = images.generate(
         ProjectiveGenerator<Device, T>(images, transform, interpolation_));
   }
 };
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 4c6d8c0d19..68771b3d05 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -19,55 +19,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
-namespace {
-
-// Sets output[0] to shape [batch_dim,height,width,channel_dim], where
-// height and width come from the size_tensor.
-Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim,
-                             int size_input_idx, DimensionHandle channel_dim) {
-  // Verify shape of size input.
-  ShapeHandle size;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size));
-  DimensionHandle unused;
-  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused));
-
-  // Get size values from the size tensor.
-  const Tensor* size_tensor = c->input_tensor(size_input_idx);
-  DimensionHandle width;
-  DimensionHandle height;
-  if (size_tensor == nullptr) {
-    width = c->UnknownDim();
-    height = c->UnknownDim();
-  } else {
-    // TODO(petewarden) - Remove once we have constant evaluation in C++ only.
-    if (size_tensor->dtype() != DT_INT32) {
-      return errors::InvalidArgument(
-          "Bad size input type for SetOutputToSizedImage: Expected DT_INT32 "
-          "but got ",
-          DataTypeString(size_tensor->dtype()), " for input #", size_input_idx,
-          " in ", c->DebugString());
-    }
-    auto vec = size_tensor->vec<int32>();
-    height = c->MakeDim(vec(0));
-    width = c->MakeDim(vec(1));
-  }
-  c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim}));
-  return Status::OK();
-}
-
-Status ResizeShapeFn(InferenceContext* c) {
-  ShapeHandle input;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
-  return SetOutputToSizedImage(c, c->Dim(input, 0), 2 /* size_input_idx */,
-                               c->Dim(input, 3));
-}
-
-}  // namespace
-
 // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc.
 // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0).
 // TODO(ringwalt): Add an "output_shape" argument. This is sufficient to
@@ -75,11 +29,13 @@ Status ResizeShapeFn(InferenceContext* c) {
 REGISTER_OP("ImageProjectiveTransform")
     .Input("images: dtype")
     .Input("transforms: float32")
-    .Input("output_shape: int32")
     .Attr("dtype: {uint8, int32, int64, float32, float64}")
     .Attr("interpolation: string")
     .Output("transformed_images: dtype")
-    .SetShapeFn(ResizeShapeFn)
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Applies the given transform to each of the images.
 
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index c0151d320f..b50177ae56 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -195,40 +195,10 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
           x_init_value=test_image)
       self.assertLess(left_err, 1e-10)
 
-  def _test_grad_different_shape(self, input_shape, output_shape):
-    with self.test_session():
-      test_image_shape = input_shape
-      test_image = np.random.randn(*test_image_shape)
-      test_image_tensor = constant_op.constant(
-          test_image, shape=test_image_shape)
-      test_transform = image_ops.angles_to_projective_transforms(
-          np.pi / 2, 4, 4)
-
-      if len(output_shape) == 2:
-        resize_shape = output_shape
-      elif len(output_shape) == 3:
-        resize_shape = output_shape[0:2]
-      elif len(output_shape) == 4:
-        resize_shape = output_shape[1:3]
-      output = image_ops.transform(
-          images=test_image_tensor,
-          transforms=test_transform,
-          output_shape=resize_shape)
-      left_err = gradient_checker.compute_gradient_error(
-          test_image_tensor,
-          test_image_shape,
-          output,
-          output_shape,
-          x_init_value=test_image)
-      self.assertLess(left_err, 1e-10)
-
   def test_grad(self):
     self._test_grad([16, 16])
     self._test_grad([4, 12, 12])
     self._test_grad([3, 4, 12, 12])
-    self._test_grad_different_shape([16, 16], [8, 8])
-    self._test_grad_different_shape([4, 12, 3], [8, 24, 3])
-    self._test_grad_different_shape([3, 4, 12, 3], [3, 8, 24, 3])
 
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index 0cb7bdc75d..c139ae89d8 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -212,11 +212,7 @@ def translations_to_projective_transforms(translations, name=None):
         axis=1)
 
 
-def transform(images,
-              transforms,
-              output_shape=None,
-              interpolation="NEAREST",
-              name=None):
+def transform(images, transforms, interpolation="NEAREST", name=None):
   """Applies the given transform(s) to the image(s).
 
   Args:
@@ -232,10 +228,7 @@ def transform(images,
        where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to
        the transform mapping input points to output points. Note that gradients
        are not backpropagated into transformation parameters.
-    output_shape: Output dimesion after the transform, [height, width].
-       If None, output is the same size as input image.
     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
-    name: The name of the op.
 
   Returns:
     Image(s) with the same type and shape as `images`, with the given
@@ -262,14 +255,6 @@ def transform(images,
     else:
       raise TypeError("Images should have rank between 2 and 4.")
 
-    if output_shape is None:
-      output_shape = images.get_shape()[1:3]
-    elif len(output_shape) != 2:
-      raise TypeError(
-          "output_shape must either be None or a vector of 2 elements.")
-    output_shape = ops.convert_to_tensor(
-        output_shape, name="output_shape", dtype=dtypes.int32)
-
     if len(transform_or_transforms.get_shape()) == 1:
       transforms = transform_or_transforms[None]
     elif transform_or_transforms.get_shape().ndims is None:
@@ -280,7 +265,7 @@ def transform(images,
     else:
       raise TypeError("Transforms should have rank 1 or 2.")
     output = gen_image_ops.image_projective_transform(
-        images, transforms, output_shape, interpolation=interpolation.upper())
+        images, transforms, interpolation=interpolation.upper())
     if len(image_or_images.get_shape()) == 2:
       return output[0, :, :, 0]
     elif len(image_or_images.get_shape()) == 3:
@@ -390,6 +375,14 @@ def _image_projective_transform_grad(op, grad):
 
   if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES:
     raise TypeError("Invalid dtype %s." % image_or_images.dtype)
+  if len(image_or_images.get_shape()) == 2:
+    images = image_or_images[None, :, :, None]
+  elif len(image_or_images.get_shape()) == 3:
+    images = image_or_images[None, :, :, :]
+  elif len(image_or_images.get_shape()) == 4:
+    images = image_or_images
+  else:
+    raise TypeError("Images should have rank between 2 and 4")
   if len(transform_or_transforms.get_shape()) == 1:
     transforms = transform_or_transforms[None]
   elif len(transform_or_transforms.get_shape()) == 2:
@@ -402,11 +395,13 @@ def _image_projective_transform_grad(op, grad):
   inverse = linalg_ops.matrix_inverse(transforms)
   transforms = matrices_to_flat_transforms(inverse)
   output = gen_image_ops.image_projective_transform(
-      images=grad,
-      transforms=transforms,
-      output_shape=image_or_images.get_shape()[1:3],
-      interpolation=interpolation)
-  return [output, None, None]
+      grad, transforms, interpolation=interpolation)
+  if len(image_or_images.get_shape()) == 2:
+    return [output[0, :, :, 0], None]
+  elif len(image_or_images.get_shape()) == 3:
+    return [output[0, :, :, :], None]
+  else:
+    return [output, None]
 
 
 def bipartite_match(distance_mat,
-- 
GitLab


From c3f5d8c53295d9740c622f5221464c23559747ad Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Thu, 19 Apr 2018 16:02:09 -0700
Subject: [PATCH 183/450] Update install_python3.5_pip_packages.sh

---
 .../tools/ci_build/install/install_python3.5_pip_packages.sh   | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index aefc49f604..204a82f647 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -39,6 +39,9 @@ if [[ -z $pip35_version ]]; then
 fi
 
 set -e
+pip3.5 install --upgrade setuptools
+pip3.5 install --upgrade pip
+
 pip3.5 install --upgrade virtualenv
 
 # Install six.
-- 
GitLab


From d4402725d2f6d9a8c5273ab1474117a27dd455c9 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 19 Apr 2018 16:30:02 -0700
Subject: [PATCH 184/450] Make xla/service:cpu_plugin depend on the
 StreamExecutor host platform.

PiperOrigin-RevId: 193593761
---
 tensorflow/compiler/xla/service/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 9009cbf845..d5d09bd8a3 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -699,6 +699,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/compiler/xla/service/cpu:cpu_transfer_manager",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:stream_executor_impl",
     ],
 )
 
-- 
GitLab


From 704ac94a8e362feb3710391787342fe36187b9ef Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 19 Apr 2018 16:30:26 -0700
Subject: [PATCH 185/450] Cleaned up the handling of merge nodes

PiperOrigin-RevId: 193593810
---
 .../core/grappler/costs/graph_properties.cc   | 89 +++++++------------
 1 file changed, 32 insertions(+), 57 deletions(-)

diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index dd2d53dfdf..a0125ce342 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -670,6 +670,29 @@ class SymbolicShapeRefiner {
     return true;
   }
 
+  Status AddNode(const Node* node) {
+    // Create the inference context for this node.
+    std::vector<ShapeHandle> input_shapes(node->num_inputs());
+    std::vector<std::unique_ptr<std::vector<ShapeAndType>>>
+        input_handle_shapes_and_types(node->num_inputs());
+    std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
+    std::vector<ShapeHandle> input_tensors_as_shapes;
+
+    NodeContext& node_ctx = node_to_context_[node];
+    TF_RETURN_IF_ERROR(
+        function_library_.LookUp(node->type_string(), &node_ctx.op_data));
+
+    node_ctx.inference_context.reset(new InferenceContext(
+        graph_def_version_, &node->def(), node->op_def(), input_shapes,
+        input_tensors, input_tensors_as_shapes,
+        std::move(input_handle_shapes_and_types)));
+    const Status s = node_ctx.inference_context->construction_status();
+    if (!s.ok()) {
+      node_ctx.inference_context.reset(nullptr);
+    }
+    return s;
+  }
+
  private:
   // Return the one ShapeHandle used to denote a fully unknown shape for a node
   // output.
@@ -698,29 +721,6 @@ class SymbolicShapeRefiner {
     return dim;
   }
 
-  Status AddNode(const Node* node) {
-    // Create the inference context for this node.
-    std::vector<ShapeHandle> input_shapes(node->num_inputs());
-    std::vector<std::unique_ptr<std::vector<ShapeAndType>>>
-        input_handle_shapes_and_types(node->num_inputs());
-    std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
-    std::vector<ShapeHandle> input_tensors_as_shapes;
-
-    NodeContext& node_ctx = node_to_context_[node];
-    TF_RETURN_IF_ERROR(
-        function_library_.LookUp(node->type_string(), &node_ctx.op_data));
-
-    node_ctx.inference_context.reset(new InferenceContext(
-        graph_def_version_, &node->def(), node->op_def(), input_shapes,
-        input_tensors, input_tensors_as_shapes,
-        std::move(input_handle_shapes_and_types)));
-    const Status s = node_ctx.inference_context->construction_status();
-    if (!s.ok()) {
-      node_ctx.inference_context.reset(nullptr);
-    }
-    return s;
-  }
-
   struct NodeContext {
     const OpRegistrationData* op_data;
     std::unique_ptr<InferenceContext> inference_context;
@@ -929,37 +929,16 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
                                         bool* new_shapes) const {
   InferenceContext* c = shape_refiner->GetContext(node);
   if (!c) {
-    // The shape refiner can't handle loops. Therefore we first need to remove
-    // all edges
-    std::vector<Edge> edges;
-    std::vector<const Edge*> edge_ptrs;
-    for (const Edge* edge : node->in_edges()) {
-      if (!edge->IsControlEdge()) {
-        edges.push_back(*edge);
-        edge_ptrs.push_back(edge);
-      }
-    }
-    for (const Edge* edge : edge_ptrs) {
-      if (!edge->IsControlEdge()) {
-        graph_->RemoveEdge(edge);
-      }
-    }
     // Now we can run shape inference
-    TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(node, relax, new_shapes));
-    // And add all the edges back
-    for (const Edge& edge : edges) {
-      graph_->AddEdge(edge.src(), edge.src_output(), edge.dst(),
-                      edge.dst_input());
-    }
-
-    c = shape_refiner->GetContext(node);
+    TF_RETURN_IF_ERROR(shape_refiner->AddNode(node));
+    c = CHECK_NOTNULL(shape_refiner->GetContext(node));
     *new_shapes = true;
-    CHECK_NE(c, nullptr);
-  }
 
-  ShapeHandle out1;
-  TF_RETURN_IF_ERROR(c->WithRank(c->output(1), 0, &out1));
-  c->set_output(1, out1);
+    // Infer the shape of the second output once and for all since it never
+    // changes.
+    ShapeHandle out1 = c->Scalar();
+    c->set_output(1, out1);
+  }
 
   ShapeHandle out;
   bool out_initialized = false;
@@ -981,11 +960,7 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
       continue;
     }
     ShapeHandle input = in->output(e->src_output());
-    if (relax) {
-      c->RelaxInput(e->dst_input(), input);
-    } else {
-      c->MergeInput(e->dst_input(), input);
-    }
+    c->SetInput(e->dst_input(), input);
     if (!out_initialized) {
       out_initialized = true;
       out = input;
@@ -998,7 +973,7 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
     }
   }
 
-  if (!shape_refiner->EquivalentShapes(out, c->output(0))) {
+  if (*new_shapes || !shape_refiner->EquivalentShapes(out, c->output(0))) {
     c->set_output(0, out);
     *new_shapes = true;
   }
-- 
GitLab


From c93a883fcea141dc0f63fe63afcd9490e39e3eaf Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Thu, 19 Apr 2018 16:35:40 -0700
Subject: [PATCH 186/450] Improve error messages for LiteralTestUtil::Near.
 Previously error messages for mismatches were difficult to read with much of
 the space taken by useless stack traces. This CL cleans up the message
 considerably and adds additional information including statistics about the
 values and mismatches.

PiperOrigin-RevId: 193594593
---
 .../compiler/xla/tests/literal_test_util.cc   | 772 +++++++++++-------
 .../compiler/xla/tests/literal_test_util.h    |   9 +-
 .../xla/tests/literal_test_util_test.cc       |   2 +-
 3 files changed, 473 insertions(+), 310 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 81630df34c..c28f79ae38 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -39,6 +39,11 @@ limitations under the License.
 
 namespace xla {
 
+using ::tensorflow::strings::Appendf;
+using ::tensorflow::strings::Printf;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+
 /* static */ ::testing::AssertionResult LiteralTestUtil::EqualShapes(
     const Shape& expected, const Shape& actual) {
   if (ShapeUtil::IsTuple(expected) != ShapeUtil::IsTuple(actual)) {
@@ -173,14 +178,11 @@ template <typename FloatT, typename UnsignedT>
   auto lhs_double = static_cast<double>(lhs);
   auto rhs_double = static_cast<double>(rhs);
   if (ulhs != urhs) {
-    return ::testing::AssertionFailure() << tensorflow::strings::Printf(
+    return ::testing::AssertionFailure() << Printf(
                "floating values are not bitwise-equal; and equality testing "
                "was requested: %s=%g=%a vs %s=%g=%a",
-               tensorflow::strings::StrCat(tensorflow::strings::Hex(ulhs))
-                   .c_str(),
-               lhs_double, lhs_double,
-               tensorflow::strings::StrCat(tensorflow::strings::Hex(urhs))
-                   .c_str(),
+               StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double,
+               lhs_double, StrCat(tensorflow::strings::Hex(urhs)).c_str(),
                rhs_double, rhs_double);
   }
   return ::testing::AssertionSuccess();
@@ -264,9 +266,7 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
       << "expected:\n"
       << expected.ToString() << "\n\tvs actual:\n"
       << actual.ToString()
-      << (message.empty()
-              ? ""
-              : tensorflow::strings::StrCat("\nmessage: ", message));
+      << (message.empty() ? "" : StrCat("\nmessage: ", message));
 }
 
 /* static */ void LiteralTestUtil::ExpectNotEqual(const Literal& expected,
@@ -321,9 +321,8 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
     case TUPLE: {
       bool tuple_match = true;
       for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
-        SCOPED_TRACE(tensorflow::strings::StrCat(
-            "Tuple index ", i, " in ",
-            ShapeUtil::HumanString(expected.shape())));
+        SCOPED_TRACE(StrCat("Tuple index ", i, " in ",
+                            ShapeUtil::HumanString(expected.shape())));
 
         // Create LiteralViews of the expected and actual elements.
         auto result = Equal(LiteralView::Create(expected, {i}),
@@ -350,227 +349,301 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
 
 namespace {
 
+// Gets the total element count.  For tuples, this is not the count of tuple
+// elements, but the sum of elements of each tuple element.
+int64 RecursiveElementCount(const Shape& shape) {
+  if (ShapeUtil::IsTuple(shape)) {
+    const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
+    int64 total = 0;
+    for (int64 i = 0; i < tuple_elements; ++i) {
+      total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
+    }
+    return total;
+  } else {
+    return ShapeUtil::ElementsIn(shape);
+  }
+}
+
+// Calling ToString on a literal with over 100 million elements takes around
+// 3 minutes.  The utility of printing a literal with >1000 elements is
+// questionable, especially when writing the Literal proto to disk is orders
+// of magnitude faster.
+string TruncateHugeLiteral(const Literal& literal) {
+  return RecursiveElementCount(literal.shape()) < 1000
+             ? literal.ToString()
+             : "[TRUNCATED, Literal with more than 1000 values]";
+}
+
+// Returns whether the actual and expected values are mismatched with respect to
+// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec.
+template <typename NativeT>
+bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
+  if (relaxed_nans) {
+    return !std::isnan(expected) && std::isnan(actual);
+  } else {
+    return std::isnan(expected) != std::isnan(actual);
+  }
+}
+
+template <>
+bool NanMismatch<complex64>(complex64 expected, complex64 actual,
+                            bool relaxed_nans) {
+  return NanMismatch<float>(expected.real(), actual.real(), relaxed_nans) ||
+         NanMismatch<float>(expected.imag(), actual.imag(), relaxed_nans);
+}
+
+template <>
+bool NanMismatch<half>(half expected, half actual, bool relaxed_nans) {
+  return NanMismatch<float>(static_cast<float>(expected),
+                            static_cast<float>(actual), relaxed_nans);
+}
+
+// Converts the given floating-point value to a string.
+template <typename NativeT>
+string FpValueToString(NativeT value) {
+  return Printf("%8.4g", static_cast<double>(value));
+}
+
+template <>
+string FpValueToString<complex64>(complex64 value) {
+  return Printf("%8.4g + %8.4fi", value.real(), value.imag());
+}
+
+// Returns the absolute value of the given floating point value. This function
+// is used instead of std::abs directly in order to allow type-dependent
+// implementations for NearComparator.
+template <typename NativeT>
+float FpAbsoluteValue(NativeT value) {
+  return std::abs(value);
+}
+
+template <>
+float FpAbsoluteValue(bfloat16 value) {
+  return FpAbsoluteValue<float>(static_cast<float>(value));
+}
+
+template <>
+float FpAbsoluteValue(half value) {
+  return FpAbsoluteValue<float>(static_cast<float>(value));
+}
+
 // Helper class for comparing floating-point literals within an error bound.
+template <typename NativeT>
 class NearComparator {
  public:
-  explicit NearComparator(ErrorSpec error) : error_(error) {}
+  // Compares the two array literals elementwise and returns an assertion
+  // result. The assertion result is successful if all actual and expected
+  // elements are within the given error bound. On failure, the assertion
+  // result contains a detailed error message.
+  static ::testing::AssertionResult Compare(const Literal& expected,
+                                            const Literal& actual,
+                                            ErrorSpec error,
+                                            bool detailed_message) {
+    NearComparator<NativeT> comparator(expected, actual, error,
+                                       detailed_message);
+    return comparator.Run();
+  }
+
+ private:
+  // Data structure encapsulating metadata about a single element mismatch.
+  struct Mismatch {
+    NativeT actual;
+    NativeT expected;
+    float rel_error;
+    float abs_error;
+
+    // The linear index of the failure within the shape. This linear index is
+    // from the 'actual' literal.
+    int64 linear_index;
+
+    bool operator<(const Mismatch& other) const {
+      return rel_error < other.rel_error;
+    }
 
-  // Compares the two literals elementwise. EXPECTs each pair of elements to be
-  // within the error bound. Emits useful log messages and dumps literals to
-  // temporary files on failure. Returns true if  literals match.
-  bool ExpectNear(const Literal& expected, const Literal& actual) {
+    string ToString(const Shape& shape) const {
+      return Printf(
+          "actual %s, expected %s, index %s, rel error %8.3g, abs error %8.3g",
+          FpValueToString(actual).c_str(), FpValueToString(expected).c_str(),
+          LiteralTestUtil::MultiIndexAsString(
+              IndexUtil::LinearIndexToMultidimensionalIndex(shape,
+                                                            linear_index))
+              .c_str(),
+          rel_error, abs_error);
+    }
+  };
+
+  explicit NearComparator(const Literal& expected, const Literal& actual,
+                          ErrorSpec error, bool detailed_message)
+      : expected_(expected),
+        actual_(actual),
+        error_(error),
+        detailed_message_(detailed_message),
+        abs_value_buckets_(kAbsValueBucketBounds.size() - 1, {0, 0}),
+        abs_error_buckets_(kErrorBucketBounds.size(), 0),
+        rel_error_buckets_(kErrorBucketBounds.size(), 0) {}
+
+  // Runs the comparison between expected and actual literals.
+  ::testing::AssertionResult Run() {
     VLOG(1) << "expected:";
-    XLA_VLOG_LINES(1, TruncateHugeLiteral(expected));
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(expected_));
     VLOG(1) << "actual:";
-    XLA_VLOG_LINES(1, TruncateHugeLiteral(actual));
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(actual_));
 
     // If the shapes mismatch, we simply fail the expectation instead of
     // printing out data, as it's a type error rather than a value error.
     ::testing::AssertionResult equal_shapes =
-        LiteralTestUtil::EqualShapes(expected.shape(), actual.shape());
+        LiteralTestUtil::EqualShapes(expected_.shape(), actual_.shape());
     if (!equal_shapes) {
-      EXPECT_TRUE(equal_shapes);
-      return false;
+      return equal_shapes;
     }
-
-    // Set up members used during the comparison.
-    num_miscompares_ = 0;
-    abs_diff_sum_ = 0.0;
-    abs_expected_sum_ = 0.0;
-    abs_diff_miscompare_sum_ = 0.0;
-    abs_expected_miscompare_sum_ = 0.0;
-    max_rel_err_ = 0.0;
-    max_abs_err_ = 0.0;
-    first_linear_index_ = -1;
-    last_linear_index_ = -1;
-    max_rel_linear_index_ = -1;
-    max_abs_linear_index_ = -1;
-    miscompares_ = Literal(ShapeUtil::ChangeElementType(actual.shape(), PRED));
-    miscompares_.PopulateWithValue(false);
-    multi_index_.resize(expected.shape().dimensions_size(), 0);
-
-    switch (expected.shape().element_type()) {
-      case BF16:
-        ExpectLiteralsNear<bfloat16>(expected, actual, 0);
-        break;
-      case F16:
-        ExpectLiteralsNear<half>(expected, actual, 0);
-        break;
-      case F32:
-        ExpectLiteralsNear<float>(expected, actual, 0);
-        break;
-      case F64:
-        ExpectLiteralsNear<double>(expected, actual, 0);
-        break;
-      case C64:
-        ExpectLiteralsNear<complex64>(expected, actual, 0);
-        break;
-      default:
-        LOG(FATAL) << "Unsupported primitive type in near comparator: "
-                   << PrimitiveType_Name(expected.shape().element_type())
-                   << ". Must be floating-point type.";
+    if (!ShapeUtil::IsArray(expected_.shape())) {
+      return ::testing::AssertionFailure() << "Expected array shape";
     }
 
-    if (num_miscompares_ > 0) {
-      if (!VLOG_IS_ON(1)) {
-        LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape())
-                  << " " << TruncateHugeLiteral(expected);
-        LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual.shape())
-                  << " " << TruncateHugeLiteral(actual);
-        LOG(INFO) << "Dumping literals to temp files...";
-        WriteLiteralToTempFile(expected, "expected");
-        WriteLiteralToTempFile(actual, "actual");
-        WriteLiteralToTempFile(miscompares_, "miscompares");
-      }
-      EXPECT_TRUE(num_miscompares_ == 0)
-          << "\nmax relative mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), max_rel_linear_index_))
-          << "\nmaximum relative error " << max_rel_err_
-          << "\nmax absolute mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), max_abs_linear_index_))
-          << "\nmaximum absolute error " << max_abs_err_
-          << "\nfirst mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), first_linear_index_))
-          << "\nlast mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), last_linear_index_))
-          << "\ntotal absolute error " << abs_diff_sum_
-          << "\ntotal absolute error of miscompares "
-          << abs_diff_miscompare_sum_ << "\ntotal relative error "
-          << (abs_diff_sum_ / abs_expected_sum_)
-          << "\ntotal relative error of miscompares "
-          << (abs_diff_miscompare_sum_ / abs_expected_miscompare_sum_)
-          << "\nfailure count " << num_miscompares_;
+    mismatches_ = Literal(ShapeUtil::ChangeElementType(actual_.shape(), PRED));
+    mismatches_.PopulateWithValue(false);
+
+    CompareLiterals();
+
+    if (num_mismatches_ == 0) {
+      return ::testing::AssertionSuccess();
+    } else if (!VLOG_IS_ON(1)) {
+      LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected_.shape())
+                << " " << TruncateHugeLiteral(expected_);
+      LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual_.shape())
+                << " " << TruncateHugeLiteral(actual_);
+      LOG(INFO) << "Dumping literals to temp files...";
+      WriteLiteralToTempFile(expected_, "expected");
+      WriteLiteralToTempFile(actual_, "actual");
+      WriteLiteralToTempFile(mismatches_, "mismatches");
     }
-    return num_miscompares_ == 0;
+    return ::testing::AssertionFailure() << ErrorMessage();
   }
 
- private:
-  template <typename NativeT>
-  bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
-    if (relaxed_nans) {
-      return !std::isnan(expected) && std::isnan(actual);
-    } else {
-      return std::isnan(expected) != std::isnan(actual);
+  // Insert the given absolute value into the absolute value bucket vector. The
+  // bounds of the buckets are given by kAbsValueBucketBounds.
+  void UpdateAbsValueBucket(NativeT value, bool is_mismatch) {
+    // Adjust the bucket containing the absolute values of the 'actual'
+    // elements.
+    const float abs_value = FpAbsoluteValue(value);
+    for (int i = 0; i < abs_value_buckets_.size(); ++i) {
+      if (i == abs_value_buckets_.size() - 1 ||
+          (abs_value >= kAbsValueBucketBounds[i] &&
+           abs_value < kAbsValueBucketBounds[i + 1])) {
+        // The first value of the pair is the count of elements in the bucket,
+        // the second is the count of mismatches in the bucket.
+        abs_value_buckets_[i].first++;
+        if (is_mismatch) {
+          abs_value_buckets_[i].second++;
+        }
+        return;
+      }
     }
   }
 
-  template <typename NativeT>
-  void ExpectNear(NativeT expected, NativeT actual,
-                  const ::testing::Message& message) {
-    EXPECT_NEAR(expected, actual, error_.abs)
-        << "expected:\n  " << expected << "\n\tvs actual:\n  " << actual << "\n"
-        << message;
-  }
-
-  // EXPECTs that the two given scalar values are within the error bound. Keeps
-  // track of how many mismatches have occurred to keep the size of the output
-  // manageable.
-  template <typename NativeT>
-  bool ExpectValuesNear(NativeT expected, NativeT actual) {
-    if (expected == actual) {
-      return true;
+  // Insert the given error into the given error bucket vector.
+  void UpdateErrorBucket(
+      float error, tensorflow::gtl::MutableArraySlice<int64> error_buckets) {
+    CHECK_EQ(error_buckets.size(), kErrorBucketBounds.size());
+    for (int i = 0; i < error_buckets.size(); ++i) {
+      if (error >= kErrorBucketBounds[i]) {
+        error_buckets[i]++;
+      }
     }
-
-    const float abs_diff = std::abs(actual - expected);
-    const float rel_err = abs_diff / std::abs(expected);
-    const bool nan_mismatch =
-        NanMismatch<NativeT>(expected, actual, error_.relaxed_nans);
-    const bool mismatch =
-        (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel));
-    return !mismatch;
   }
 
-  // Assumes that expected vs actual fail ExpectValuesNear.
-  template <typename NativeT>
-  void UpdateAndLogMiscompares(const NativeT expected, const NativeT actual,
-                               const Shape& shape, const int64 linear_index) {
-    const float abs_diff = std::abs(actual - expected);
-    const float rel_err = abs_diff / std::abs(expected);
-    abs_diff_sum_ += abs_diff;
-    abs_expected_sum_ += std::abs(expected);
-    if (rel_err > max_rel_err_ || std::isnan(rel_err)) {
-      max_rel_err_ = rel_err;
-      max_rel_linear_index_ = linear_index;
+  // Compares the two given elements from the expected and actual literals at
+  // the given linear_index and keeps track of various mismatch statistics.
+  void CompareValues(NativeT expected, NativeT actual, int64 linear_index) {
+    const bool is_nan_mismatch =
+        NanMismatch(expected, actual, error_.relaxed_nans);
+    float abs_error;
+    float rel_error;
+    if (actual == expected) {
+      abs_error = 0;
+      rel_error = 0;
+    } else if (is_nan_mismatch) {
+      num_nan_mismatches_++;
+      // A nan mismatch is considered to have infinite error. rel_error is used
+      // for sorting a std::multiset of the top mismatches, and a nan value here
+      // will result in undefined behavior because nans do not satisfy the
+      // strict weak ordering requirement of std containers.
+      abs_error = std::numeric_limits<float>::infinity();
+      rel_error = std::numeric_limits<float>::infinity();
+    } else {
+      abs_error = FpAbsoluteValue(actual - expected);
+      rel_error = abs_error / FpAbsoluteValue(expected);
     }
-    if (abs_diff > max_abs_err_ || std::isnan(abs_diff)) {
-      max_abs_err_ = abs_diff;
-      max_abs_linear_index_ = linear_index;
+    const bool is_abs_mismatch = abs_error > error_.abs;
+    const bool is_rel_mismatch = rel_error > error_.rel;
+    const bool is_mismatch =
+        is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch);
+
+    // Update the error of the relative bucket only if the *absolute* error
+    // bound is exceeded and vice versa.
+    if (is_abs_mismatch) {
+      num_abs_mismatches_++;
+      UpdateErrorBucket(rel_error, &rel_error_buckets_);
     }
-    if (VLOG_IS_ON(10)) {
-      VLOG(10) << tensorflow::strings::Printf(
-          "index %s abs_diff %f rel_err %f",
-          LiteralTestUtil::MultiIndexAsString(
-              IndexUtil::LinearIndexToMultidimensionalIndex(shape,
-                                                            linear_index))
-              .c_str(),
-          abs_diff, rel_err);
+    if (is_rel_mismatch) {
+      num_rel_mismatches_++;
+      UpdateErrorBucket(abs_error, &abs_error_buckets_);
     }
-    abs_diff_miscompare_sum_ += abs_diff;
-    abs_expected_miscompare_sum_ += std::abs(expected);
-    const int64 kMaxFailures = 2;
-    if (num_miscompares_ < kMaxFailures) {
-      const auto multi_index =
-          IndexUtil::LinearIndexToMultidimensionalIndex(shape, linear_index);
-      ::testing::Message msg;
-      msg << "mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(multi_index) << " abs diff "
-          << abs_diff << " rel err " << rel_err << " failure #"
-          << num_miscompares_;
-      ExpectNear<NativeT>(expected, actual, msg);
-    } else if (num_miscompares_ == kMaxFailures) {
-      LOG(ERROR) << "reached max 'loud' failure count; silently proceeding...";
+
+    UpdateAbsValueBucket(actual, is_mismatch);
+
+    if (!is_mismatch) {
+      return;
     }
-    if (num_miscompares_ == 0) {
-      first_linear_index_ = linear_index;
+
+    num_mismatches_++;
+
+    // Keep track of the top kTopRelativeErrorCount relative error mismatches.
+    if (top_rel_mismatches_.size() < kTopRelativeErrorCount ||
+        rel_error > top_rel_mismatches_.begin()->rel_error) {
+      Mismatch mismatch = {actual, expected, rel_error, abs_error,
+                           linear_index};
+      top_rel_mismatches_.insert(mismatch);
+      if (top_rel_mismatches_.size() > kTopRelativeErrorCount) {
+        top_rel_mismatches_.erase(top_rel_mismatches_.begin());
+      }
     }
-    num_miscompares_++;
-    last_linear_index_ = linear_index;
-    miscompares_.data<bool>()[linear_index] = true;
+
+    mismatches_.data<bool>()[linear_index] = true;
   }
 
-  // Recursive function which compares the two given literals elementwise.
-  template <typename NativeT>
-  void ExpectLiteralsNear(const Literal& expected, const Literal& actual,
-                          int64 dimension) {
+  // Compares the two literals elementwise.
+  void CompareLiterals() {
     // Fast path optimization for the case were layouts match.
-    if (LayoutUtil::Equal(actual.shape().layout(), expected.shape().layout())) {
+    if (LayoutUtil::Equal(actual_.shape().layout(),
+                          expected_.shape().layout())) {
       tensorflow::gtl::ArraySlice<const NativeT> expected_data =
-          expected.data<NativeT>();
+          expected_.data<NativeT>();
       tensorflow::gtl::ArraySlice<const NativeT> actual_data =
-          actual.data<NativeT>();
+          actual_.data<NativeT>();
       const int64 len = expected_data.size();
       for (int64 i = 0; i < len; ++i) {
-        const bool near = ExpectValuesNear(expected_data[i], actual_data[i]);
-        if (!near) {
-          UpdateAndLogMiscompares<NativeT>(expected_data[i], actual_data[i],
-                                           actual.shape(), i);
-        }
+        CompareValues(expected_data[i], actual_data[i], i);
       }
       return;
     }
+    std::vector<int64> multi_index(ShapeUtil::Rank(actual_.shape()), 0);
+    CompareLiteralsSlow(0, &multi_index);
+  }
 
-    if (dimension == expected.shape().dimensions_size()) {
-      bool near = ExpectValuesNear(expected.Get<NativeT>(multi_index_),
-                                   actual.Get<NativeT>(multi_index_));
-      if (!near) {
-        UpdateAndLogMiscompares<NativeT>(
-            expected.Get<NativeT>(multi_index_),
-            actual.Get<NativeT>(multi_index_), actual.shape(),
-            IndexUtil::MultidimensionalIndexToLinearIndex(actual.shape(),
-                                                          multi_index_));
-      }
+  // Slow path for CompareLiterals when 'actual' and 'expected' literals have
+  // different layouts. In this case, multidimensional indices are constructed
+  // and indexed for each element.
+  void CompareLiteralsSlow(int64 dimension, std::vector<int64>* multi_index) {
+    if (dimension == multi_index->size()) {
+      CompareValues(expected_.Get<NativeT>(*multi_index),
+                    actual_.Get<NativeT>(*multi_index),
+                    IndexUtil::MultidimensionalIndexToLinearIndex(
+                        actual_.shape(), *multi_index));
     } else {
-      for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) {
-        multi_index_[dimension] = i;
-        ExpectLiteralsNear<NativeT>(expected, actual, dimension + 1);
+      for (int64 i = 0; i < expected_.shape().dimensions(dimension); ++i) {
+        (*multi_index)[dimension] = i;
+        CompareLiteralsSlow(dimension + 1, multi_index);
       }
     }
   }
@@ -580,159 +653,247 @@ class NearComparator {
     int64 now_usec = tensorflow::Env::Default()->NowMicros();
     string filename = tensorflow::io::JoinPath(
         tensorflow::testing::TmpDir(),
-        tensorflow::strings::Printf("tempfile-%s-%llx-%s", Hostname().c_str(),
-                                    now_usec, name.c_str()));
+        Printf("tempfile-%s-%llx-%s", Hostname().c_str(), now_usec,
+               name.c_str()));
     TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(),
                                              filename, literal.ToProto()));
     LOG(ERROR) << "wrote to " << name << " file: " << filename;
   }
 
-  // Gets the total element count.  For tuples, this is not the count of tuple
-  // elements, but the sum of elements of each tuple element.
-  int64 RecursiveElementCount(const Shape& shape) {
-    if (ShapeUtil::IsTuple(shape)) {
-      const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
-      int64 total = 0;
-      for (int64 i = 0; i < tuple_elements; ++i) {
-        total +=
-            RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
-      }
-      return total;
-    } else {
-      return ShapeUtil::ElementsIn(shape);
+  // Returns an error message string with a detailed breakdown of the
+  // mismatches. Called by Run() after the comparison has completed.
+  string ErrorMessage() {
+    string out;
+    int64 element_count = ShapeUtil::ElementsIn(actual_.shape());
+
+    auto percent_string = [](float a, float b) {
+      float pct = b == 0.0 ? 0.0 : 100.0 * a / b;
+      return Printf("%0.4f%%", pct);
+    };
+
+    Appendf(&out,
+            "\nMismatch count %lld (%s) in shape %s (%lld elements), abs bound "
+            "%g, rel bound %g\n",
+            num_mismatches_,
+            percent_string(num_mismatches_, element_count).c_str(),
+            ShapeUtil::HumanString(actual_.shape()).c_str(),
+            ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel);
+    if (num_nan_mismatches_ > 0) {
+      StrAppend(&out, "nan mismatches ", num_nan_mismatches_, "\n");
+    }
+    Appendf(&out, "Top relative error mismatches:\n");
+    for (auto it = top_rel_mismatches_.rbegin();
+         it != top_rel_mismatches_.rend(); ++it) {
+      StrAppend(&out, "  ", it->ToString(actual_.shape()).c_str(), "\n");
     }
-  }
 
-  // Calling ToString on a literal with over 100 million elements takes around
-  // 3 minutes.  The utility of printing a literal with >1000 elements is
-  // questionable, especially when writing the Literal proto to disk is orders
-  // of magnitude faster.
-  string TruncateHugeLiteral(const Literal& literal) {
-    return RecursiveElementCount(literal.shape()) < 1000
-               ? literal.ToString()
-               : "[TRUNCATED, Literal with more than 1000 values]";
-  }
+    if (!detailed_message_) {
+      return out;
+    }
 
-  ErrorSpec error_;
+    StrAppend(&out, "Absolute magnitude breakdown of actual values:\n");
+    CHECK_EQ(abs_value_buckets_.size() + 1, kAbsValueBucketBounds.size());
+    for (int i = 0; i < abs_value_buckets_.size(); ++i) {
+      const int64 bucket_size = abs_value_buckets_[i].first;
+      const int64 bucket_mismatches = abs_value_buckets_[i].second;
+      string mismatch_str = bucket_mismatches > 0
+                                ? Printf(", mismatches %lld", bucket_mismatches)
+                                : "";
+      Appendf(&out, "  %-6g <= x < %-6g : %7lld (%9s)%s\n",
+              kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1],
+              bucket_size, percent_string(bucket_size, element_count).c_str(),
+              mismatch_str.c_str());
+    }
 
-  // Number of element miscomparisons encountered so far.
-  int64 num_miscompares_;
+    auto print_accum_buckets = [&](const string& header, int64 total,
+                                   tensorflow::gtl::ArraySlice<int64> buckets) {
+      StrAppend(&out, header, ":\n");
+      Appendf(&out, "  <  %-6g : %7lld (%s)\n", kErrorBucketBounds[0],
+              total - buckets[0],
+              percent_string(total - buckets[0], total).c_str());
+      CHECK_EQ(buckets.size(), kErrorBucketBounds.size());
+      for (int i = 0; i < kErrorBucketBounds.size(); ++i) {
+        Appendf(&out, "  >= %-6g : %7lld (%s)\n", kErrorBucketBounds[i],
+                buckets[i], percent_string(buckets[i], total).c_str());
+      }
+    };
+    Appendf(&out, "Elements exceeding abs error bound %g: %lld (%s)\n",
+            error_.abs, num_abs_mismatches_,
+            percent_string(num_abs_mismatches_, element_count).c_str());
+    print_accum_buckets(
+        "Relative error breakdown of elements exceeding abs error bound",
+        num_abs_mismatches_, rel_error_buckets_);
+    Appendf(&out, "Elements exceeding rel error bound %g: %lld (%s)\n",
+            error_.rel, num_rel_mismatches_,
+            percent_string(num_rel_mismatches_, element_count).c_str());
+    print_accum_buckets(
+        "Absolute error breakdown of elements exceeding rel error bound",
+        num_rel_mismatches_, abs_error_buckets_);
+    return out;
+  }
 
-  // A Literal containing which elements did not match in the expected and
-  // actual literals. miscompares_ contains PREDs and is of the same sizes as
-  // the comparison literals.
-  Literal miscompares_;
-
-  // A multidimensional index used when performing the recursive comparison.
-  std::vector<int64> multi_index_;
-
-  // Aggregated Statistics on input.
-  double abs_diff_sum_;
-  double abs_expected_sum_;
-  double abs_diff_miscompare_sum_;
-  double abs_expected_miscompare_sum_;
-  float max_rel_err_;
-  float max_abs_err_;
-  int64 first_linear_index_;
-  int64 last_linear_index_;
-  int64 max_rel_linear_index_;
-  int64 max_abs_linear_index_;
-};
+  // 'actual' and 'expected' literals being compared.
+  const Literal& expected_;
+  const Literal& actual_;
 
-template <>
-bool NearComparator::NanMismatch<complex64>(complex64 expected,
-                                            complex64 actual,
-                                            bool relaxed_nans) {
-  return NanMismatch(expected.real(), actual.real(), relaxed_nans) ||
-         NanMismatch(expected.imag(), actual.imag(), relaxed_nans);
-}
+  // The error bounds of the comparison.
+  ErrorSpec error_;
 
-template <>
-void NearComparator::ExpectNear<complex64>(complex64 expected, complex64 actual,
-                                           const ::testing::Message& message) {
-  EXPECT_NEAR(expected.real(), actual.real(), error_.abs)
-      << "expected:\n  " << expected << "\n\tvs actual:\n  " << actual << "\n"
-      << message;
-  EXPECT_NEAR(expected.imag(), actual.imag(), error_.abs)
-      << "expected:\n  " << expected << "\n\tvs actual:\n  " << actual << "\n"
-      << message;
-}
+  // Whether to include detailed breakdown of mismatches in the error message.
+  bool detailed_message_;
 
-template <>
-bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
-                                                bfloat16 actual) {
-  return ExpectValuesNear(static_cast<float>(expected),
-                          static_cast<float>(actual));
-}
+  // Number of element mismatches encountered so far.
+  int64 num_mismatches_ = 0;
 
-template <>
-bool NearComparator::ExpectValuesNear<half>(half expected, half actual) {
-  return ExpectValuesNear(static_cast<float>(std::move(expected)),
-                          static_cast<float>(std::move(actual)));
-}
+  // Number of elements with a nan mismatch.
+  int64 num_nan_mismatches_ = 0;
 
-template <>
-void NearComparator::UpdateAndLogMiscompares<bfloat16>(
-    const bfloat16 expected, const bfloat16 actual, const Shape& shape,
-    const int64 linear_index) {
-  UpdateAndLogMiscompares(static_cast<float>(expected),
-                          static_cast<float>(actual), shape, linear_index);
-}
+  // Number of elements which exceed the absolute/relative error bound.
+  int64 num_abs_mismatches_ = 0;
+  int64 num_rel_mismatches_ = 0;
 
-template <>
-void NearComparator::UpdateAndLogMiscompares<half>(half expected, half actual,
-                                                   const Shape& shape,
-                                                   const int64 linear_index) {
-  UpdateAndLogMiscompares(static_cast<float>(std::move(expected)),
-                          static_cast<float>(std::move(actual)), shape,
-                          linear_index);
-}
-
-}  // namespace
+  // A Literal recording which elements did not match in the expected and
+  // actual literals. mismatches_ contains PREDs and is the same size as the
+  // comparison literals.
+  Literal mismatches_;
+
+  // The number of mismatches to report in the output, sorted by relative error
+  // magnitude.
+  static constexpr int64 kTopRelativeErrorCount = 5;
+
+  // The set of mismatches with the largest relative error. The size of this set
+  // is bounded by kTopRelativeErrorCount.
+  std::multiset<Mismatch> top_rel_mismatches_;
+
+  // Actual values are bucketed by absolute value. kAbsValueBucketBounds holds
+  // the bounds of these buckets. abs_value_buckets_ contains a pair for each
+  // bucket: the element count and failure count.
+  static constexpr std::array<float, 7> kAbsValueBucketBounds = {
+      0.0, 0.0001, 0.001, 0.01, 0.1, 1, std::numeric_limits<float>::infinity()};
+  std::vector<std::pair<int64, int64>> abs_value_buckets_;
+
+  // Buckets for relative and absolute errors. The relative error buckets only
+  // contain those elements which exceed the *absolute* error bound, and vice
+  // versa. This makes it easy to see the effect of adjusting the relative (or
+  // absolute) error bound on the success of the comparison. kErrorBucketBounds
+  // are the lower bounds of the buckets in both vectors. The error buckets are
+  // a cumulative distribution so an error value may appear in more than one
+  // bucket. For example an error value of 0.003 may appear in the buckets
+  // bounded by 0.01, 0.1, and 1.0.
+  static constexpr std::array<float, 5> kErrorBucketBounds = {0.0001, 0.001,
+                                                              0.01, 0.1, 1};
+  std::vector<int64> abs_error_buckets_;
+  std::vector<int64> rel_error_buckets_;
+};
 
-/* static */ ::testing::AssertionResult LiteralTestUtil::Near(
-    const Literal& expected, const Literal& actual, const ErrorSpec& error) {
+template <typename NativeT>
+constexpr std::array<float, 7> NearComparator<NativeT>::kAbsValueBucketBounds;
+template <typename NativeT>
+constexpr std::array<float, 5> NearComparator<NativeT>::kErrorBucketBounds;
+
+// Helper function for comparing two literals for nearness. Handles tuple-shapes
+// via recursion. shape_index is the ShapeIndex of expected (or actual)
+// currently being compared.
+::testing::AssertionResult NearHelper(const Literal& expected,
+                                      const Literal& actual,
+                                      const ErrorSpec& error,
+                                      bool detailed_message,
+                                      const ShapeIndex& shape_index) {
   ::testing::AssertionResult err =
-      EqualShapes(expected.shape(), actual.shape());
+      LiteralTestUtil::EqualShapes(expected.shape(), actual.shape());
   if (!err) {
     return err;
   }
 
   if (ShapeUtil::IsTuple(expected.shape())) {
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
-      SCOPED_TRACE(tensorflow::strings::StrCat(
-          "Tuple index ", i, " in ", ShapeUtil::HumanString(expected.shape())));
       const auto expected_element = LiteralView::Create(expected, {i});
       const auto actual_element = LiteralView::Create(actual, {i});
-
+      ShapeIndex element_index = shape_index;
+      element_index.push_back(i);
       ::testing::AssertionResult res =
-          Near(expected_element, actual_element, error);
-      if (err && !res) {
-        err = res;
+          NearHelper(expected_element, actual_element, error, detailed_message,
+                     element_index);
+      if (!res) {
+        string err_message =
+            Printf("\nArray at shape index %s%s",
+                   element_index.ToString().c_str(), res.message());
+        if (err) {
+          err = ::testing::AssertionFailure() << err_message;
+        } else {
+          err << err_message;
+        }
       }
     }
+    if (!err && shape_index.empty()) {
+      // Emit a top-level error message containing the top-level shape in case
+      // of mismatch.
+      int64 total_elements = RecursiveElementCount(actual.shape());
+      err = ::testing::AssertionFailure()
+            << Printf("\nMismatches in shape %s (%lld elements):\n%s",
+                      ShapeUtil::HumanString(actual.shape()).c_str(),
+                      total_elements, err.message());
+    }
     return err;
   }
 
   if (ShapeUtil::ElementIsFloating(expected.shape()) ||
       ShapeUtil::ElementIsComplex(expected.shape())) {
-    NearComparator comparator(error);
-    return comparator.ExpectNear(expected, actual)
-               ? ::testing::AssertionSuccess()
-               : ::testing::AssertionFailure() << "values were not near";
+    switch (expected.shape().element_type()) {
+      case BF16:
+        return NearComparator<bfloat16>::Compare(expected, actual, error,
+                                                 detailed_message);
+        break;
+      case F16:
+        return NearComparator<half>::Compare(expected, actual, error,
+                                             detailed_message);
+        break;
+      case F32:
+        return NearComparator<float>::Compare(expected, actual, error,
+                                              detailed_message);
+        break;
+      case F64:
+        return NearComparator<double>::Compare(expected, actual, error,
+                                               detailed_message);
+        break;
+      case C64:
+        return NearComparator<complex64>::Compare(expected, actual, error,
+                                                  detailed_message);
+        break;
+      default:
+        LOG(FATAL) << "Unsupported primitive type in near comparator: "
+                   << PrimitiveType_Name(expected.shape().element_type())
+                   << ". Must be floating-point type.";
+    }
   }
 
-  return Equal(expected, actual);
+  // Non-floating point literal.
+  return LiteralTestUtil::Equal(expected, actual);
+}
+
+}  // namespace
+
+/* static */ ::testing::AssertionResult LiteralTestUtil::Near(
+    const Literal& expected, const Literal& actual, const ErrorSpec& error,
+    bool detailed_message) {
+  return NearHelper(expected, actual, error, detailed_message,
+                    /*shape_index=*/{});
 }
 
 /* static */ void LiteralTestUtil::ExpectNear(const Literal& expected,
                                               const Literal& actual,
                                               const ErrorSpec& error,
                                               const string& message) {
-  EXPECT_TRUE(Near(expected, actual, error))
-      << (message.empty()
-              ? ""
-              : tensorflow::strings::StrCat("\nmessage: ", message));
+  ::testing::AssertionResult res =
+      Near(expected, actual, error, /*detailed_message=*/false);
+  if (!res) {
+    res << "Expected: " << TruncateHugeLiteral(expected) << "\n";
+    res << "Actual: " << TruncateHugeLiteral(actual) << "\n";
+    if (!message.empty()) {
+      res << StrCat("\nmessage: ", message);
+    }
+  }
+  EXPECT_TRUE(res);
 }
 
 /*static*/ ::testing::AssertionResult LiteralTestUtil::NearOrEqual(
@@ -754,8 +915,7 @@ void NearComparator::UpdateAndLogMiscompares<half>(half expected, half actual,
 
 /* static */ string LiteralTestUtil::MultiIndexAsString(
     tensorflow::gtl::ArraySlice<int64> multi_index) {
-  return tensorflow::strings::StrCat(
-      "{", tensorflow::str_util::Join(multi_index, ","), "}");
+  return StrCat("{", tensorflow::str_util::Join(multi_index, ","), "}");
 }
 
 /* static */ std::unique_ptr<Literal> LiteralTestUtil::Reshape(
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 7b757a4bd7..a755568c0f 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -122,16 +122,19 @@ class LiteralTestUtil {
   // bounds are equivalent.
   //
   // Tuples are matched recursively.  When comparing tensors of
-  // non-floating-point type, checks for exact equality, ignoring the ErroSpec.
+  // non-floating-point type, checks for exact equality, ignoring the ErrorSpec.
   //
   // If the shape of the literals is neither a complex/floating-point tensor nor
   // a tuple which contains a complex/floating-point tensor, Near() is
   // equivalent to Equal().  We don't raise an error in this case, because we
   // want to allow callers to call Near() even if they have no preconceptions
   // about the shapes being compared.
+  //
+  // If detailed_message is true, then the error message in the assertion result
+  // will contain a more detailed breakdown of mismatches.
   static ::testing::AssertionResult Near(
-      const Literal& expected, const Literal& actual,
-      const ErrorSpec& error) TF_MUST_USE_RESULT;
+      const Literal& expected, const Literal& actual, const ErrorSpec& error,
+      bool detailed_message = false) TF_MUST_USE_RESULT;
 
   // Expects expected and actual to be Near with the given error.
   static void ExpectNear(const Literal& expected, const Literal& actual,
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index 3a421f8458..9d619a77c7 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -89,7 +89,7 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
       EXPECT_EQ("2", literal->ToString());
     } else if (result.find("actual") != string::npos) {
       EXPECT_EQ("4", literal->ToString());
-    } else if (result.find("miscompares") != string::npos) {
+    } else if (result.find("mismatches") != string::npos) {
       EXPECT_EQ("true", literal->ToString());
     } else {
       FAIL() << "unknown file in temporary directory: " << result;
-- 
GitLab


From e07c9e23a94866966aa7e336a519b55931d570e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 16:53:14 -0700
Subject: [PATCH 187/450] Run EvaluateNodes for ModelPruner test except for
 NoPruning.

PiperOrigin-RevId: 193596812
---
 tensorflow/core/grappler/optimizers/BUILD     |  1 +
 .../grappler/optimizers/model_pruner_test.cc  | 52 +++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 63492e1a7f..a371186fe6 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -365,6 +365,7 @@ tf_cuda_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index 2b12eadec9..cf5b990377 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/utils.h"
@@ -133,6 +134,13 @@ TEST_F(ModelPrunerTest, IdentityPruning) {
   EXPECT_EQ(NodeName(b.name()), new_d.input(0));
   EXPECT_EQ(1, new_c.input_size());
   EXPECT_EQ(NodeName(b.name()), new_c.input(0));
+
+  std::vector<string> fetch = {"e"};
+  auto expected_tensors = EvaluateNodes(item.graph, fetch);
+  auto actual_tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, NoOpPruning) {
@@ -171,6 +179,13 @@ TEST_F(ModelPrunerTest, NoOpPruning) {
       EXPECT_EQ("a", new_node.input(0));
     }
   }
+
+  std::vector<string> fetch = {"e"};
+  auto expected_tensors = EvaluateNodes(item.graph, fetch);
+  auto actual_tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, PreserveIdentities) {
@@ -201,6 +216,19 @@ TEST_F(ModelPrunerTest, PreserveIdentities) {
 
   TF_EXPECT_OK(status);
   EXPECT_EQ(item.graph.node_size(), output.node_size());
+
+  auto v_in_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  Tensor v_ctrl_t(DT_BOOL, TensorShape({}));
+  v_ctrl_t.flat<bool>()(0) = true;
+  auto expected_tensors = EvaluateNodes(
+      item.graph, {"merge", "id2"}, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  auto actual_tensors = EvaluateNodes(output, {"merge", "id2"},
+                                      {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, expected_tensors.size());
+  EXPECT_EQ(2, actual_tensors.size());
+  for (int i = 0; i < expected_tensors.size(); i++) {
+    test::ExpectTensorEqual<float>(expected_tensors[i], actual_tensors[i]);
+  }
 }
 
 TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) {
@@ -241,6 +269,14 @@ TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) {
   EXPECT_EQ("b", new_c.input(0));
   EXPECT_EQ("b", new_d.input(0));
   EXPECT_EQ("b", new_e.input(0));
+
+  std::vector<string> fetch = {"e"};
+  auto a_t = GenerateRandomTensor<DT_INT64>(TensorShape({}));
+  auto expected_tensors = EvaluateNodes(item.graph, fetch, {{"a", a_t}});
+  auto actual_tensors = EvaluateNodes(output, fetch, {{"a", a_t}});
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<int64>(expected_tensors[0], actual_tensors[0]);
 }
 
 // TODO(rmlarsen): Reenable this test when the issues with
@@ -316,6 +352,12 @@ TEST_F(ModelPrunerTest, PruningPerservesFetch) {
   EXPECT_EQ(NodeName(b.name()), new_b.name());
   const NodeDef& new_c = output.node(2);
   EXPECT_EQ(NodeName(c.name()), new_c.name());
+
+  auto expected_tensors = EvaluateNodes(item.graph, item.fetch);
+  auto actual_tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) {
@@ -348,6 +390,16 @@ TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) {
       EXPECT_EQ("c", node.input(0));
     }
   }
+  if (GetNumAvailableGPUs() > 0) {
+    auto expected_tensors = EvaluateNodes(item.graph, item.fetch);
+    auto actual_tensors = EvaluateNodes(output, item.fetch);
+    EXPECT_EQ(4, expected_tensors.size());
+    EXPECT_EQ(4, actual_tensors.size());
+    for (int i = 0; i < expected_tensors.size(); i++) {
+      test::ExpectTensorNear<float>(expected_tensors[i], actual_tensors[i],
+                                    1e-6);
+    }
+  }
 }
 
 }  // namespace
-- 
GitLab


From 2d8da1d12a5fbeaa99e1cdd761b735a02020611b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 17:17:05 -0700
Subject: [PATCH 188/450] Removed deprecated methods from
 tensorflow::StringPiece.

This will allow tensorflow::StringPiece to be more easily replaced with absl::string_view as absl::string_view does not contain those methods.

PiperOrigin-RevId: 193599651
---
 tensorflow/core/lib/core/stringpiece.cc      |  4 ---
 tensorflow/core/lib/core/stringpiece.h       | 26 --------------------
 tensorflow/core/lib/core/stringpiece_test.cc | 10 --------
 3 files changed, 40 deletions(-)

diff --git a/tensorflow/core/lib/core/stringpiece.cc b/tensorflow/core/lib/core/stringpiece.cc
index 0b006fa2b4..4c488066e4 100644
--- a/tensorflow/core/lib/core/stringpiece.cc
+++ b/tensorflow/core/lib/core/stringpiece.cc
@@ -25,10 +25,6 @@ std::ostream& operator<<(std::ostream& o, StringPiece piece) {
   return o;
 }
 
-bool StringPiece::contains(StringPiece s) const {
-  return std::search(begin(), end(), s.begin(), s.end()) != end();
-}
-
 size_t StringPiece::find(char c, size_t pos) const {
   if (pos >= size_) {
     return npos;
diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h
index 835b938cbf..0cf6c24850 100644
--- a/tensorflow/core/lib/core/stringpiece.h
+++ b/tensorflow/core/lib/core/stringpiece.h
@@ -88,20 +88,6 @@ class StringPiece {
 
   size_t find(char c, size_t pos = 0) const;
   size_t rfind(char c, size_t pos = npos) const;
-  // DEPRECATED: Use tensorflow::str_util::StrContains instead.
-  bool contains(StringPiece s) const;
-
-  // Checks whether StringPiece starts with x and if so advances the beginning
-  // of it to past the match.  It's basically a shortcut for starts_with
-  // followed by remove_prefix.
-  // DEPRECATED: Use tensorflow::str_util::ConsumePrefix instead.
-  bool Consume(StringPiece x) {
-    if (starts_with(x)) {
-      remove_prefix(x.size_);
-      return true;
-    }
-    return false;
-  }
 
   StringPiece substr(size_t pos, size_t n = npos) const;
 
@@ -114,18 +100,6 @@ class StringPiece {
   //   >  0 iff "*this" >  "b"
   int compare(StringPiece b) const;
 
-  // Return true iff "x" is a prefix of "*this"
-  // DEPRECATED: Use tensorflow::str_util::StartsWith instead.
-  bool starts_with(StringPiece x) const {
-    return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0));
-  }
-  // Return true iff "x" is a suffix of "*this"
-  // DEPRECATED: Use tensorflow::str_util::EndsWith instead.
-  bool ends_with(StringPiece x) const {
-    return ((size_ >= x.size_) &&
-            (memcmp(data_ + (size_ - x.size_), x.data_, x.size_) == 0));
-  }
-
  private:
   const char* data_;
   size_t size_;
diff --git a/tensorflow/core/lib/core/stringpiece_test.cc b/tensorflow/core/lib/core/stringpiece_test.cc
index d0dbeb6072..de35d6eac6 100644
--- a/tensorflow/core/lib/core/stringpiece_test.cc
+++ b/tensorflow/core/lib/core/stringpiece_test.cc
@@ -55,14 +55,4 @@ TEST(StringPiece, Ctor) {
   }
 }
 
-TEST(StringPiece, Contains) {
-  StringPiece a("abcdefg");
-  StringPiece b("abcd");
-  StringPiece c("efg");
-  StringPiece d("gh");
-  EXPECT_TRUE(a.contains(b));
-  EXPECT_TRUE(a.contains(c));
-  EXPECT_TRUE(!a.contains(d));
-}
-
 }  // namespace tensorflow
-- 
GitLab


From 4e17a3f1496b398afe632b002b0589b7346b2e3f Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 19 Apr 2018 17:18:10 -0700
Subject: [PATCH 189/450] [XLA] De-unique_ptr-ify ShapedBuffer and
 ScopedShapedBuffer.

These are already notionally equivalent to T* and unique_ptr<T>, so
having a unique_ptr of a {Scoped,}ShapedBuffer is pretty redundant.

Also clean up the ScopedShapedBuffer API a bit.

PiperOrigin-RevId: 193599773
---
 tensorflow/compiler/jit/xla_launch_util.cc    |  47 ++---
 tensorflow/compiler/jit/xla_launch_util.h     |   2 +-
 tensorflow/compiler/jit/xla_tensor.cc         |   6 +-
 tensorflow/compiler/jit/xla_tensor.h          |   6 +-
 .../compiler/xla/client/local_client.cc       |  23 ++-
 tensorflow/compiler/xla/client/local_client.h |   6 +-
 .../xla/python/local_computation_builder.cc   |  46 ++---
 .../xla/python/local_computation_builder.h    |   6 +-
 .../xla/service/allocation_tracker.cc         |  33 ++--
 .../compiler/xla/service/allocation_tracker.h |  14 +-
 .../xla/service/cpu/cpu_executable.cc         |  14 +-
 .../compiler/xla/service/cpu/cpu_executable.h |   8 +-
 .../service/cpu/parallel_cpu_executable.cc    |   9 +-
 .../xla/service/cpu/parallel_cpu_executable.h |   4 +-
 tensorflow/compiler/xla/service/executable.cc |  16 +-
 tensorflow/compiler/xla/service/executable.h  |   8 +-
 .../xla/service/gpu/gpu_executable.cc         |  10 +-
 .../compiler/xla/service/gpu/gpu_executable.h |   4 +-
 tensorflow/compiler/xla/service/hlo_runner.cc |  45 +++--
 .../xla/service/interpreter/executable.cc     |   9 +-
 .../xla/service/interpreter/executable.h      |   4 +-
 tensorflow/compiler/xla/service/service.cc    |  14 +-
 .../compiler/xla/service/shaped_buffer.cc     |  36 ++--
 .../compiler/xla/service/shaped_buffer.h      |  64 ++++---
 .../compiler/xla/service/transfer_manager.cc  |  21 ++-
 .../compiler/xla/service/transfer_manager.h   |   8 +-
 .../compiler/xla/tests/dynamic_ops_test.cc    |   8 +-
 tensorflow/compiler/xla/tests/fusion_test.cc  |  16 +-
 .../xla/tests/local_client_allocation_test.cc |   7 +-
 .../xla/tests/local_client_execute_test.cc    | 170 ++++++++----------
 .../xla/tests/local_client_test_base.cc       |  12 +-
 .../xla/tests/local_client_test_base.h        |  11 +-
 .../xla/tests/transfer_manager_test.cc        |  46 ++---
 .../xla/tests/xla_hlo_profile_test.cc         |  10 +-
 34 files changed, 373 insertions(+), 370 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 50b0061d69..3520501c1a 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -32,10 +32,13 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
+namespace {
 namespace gpu = perftools::gputools;
+using xla::ScopedShapedBuffer;
+using xla::ShapedBuffer;
+}  // anonymous namespace
 
 namespace tensorflow {
-
 std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
                                                         int num_variables) {
   std::map<int, OptionalTensor> snapshot;
@@ -80,17 +83,17 @@ namespace {
 // Return the 'index''th subtree of the given ShapedBuffer as a
 // ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the
 // subtree, and sets the input's buffer pointers to nullptr for the subtree.
-std::unique_ptr<xla::ScopedShapedBuffer> ExtractSubShapedBuffer(
-    xla::ShapedBuffer* shaped_buffer, int index,
+ScopedShapedBuffer ExtractSubShapedBuffer(
+    ShapedBuffer* shaped_buffer, int index,
     xla::DeviceMemoryAllocator* allocator) {
   xla::Shape on_host_shape = xla::ShapeUtil::GetTupleElementShape(
       shaped_buffer->on_host_shape(), index);
   xla::Shape on_device_shape = xla::ShapeUtil::GetTupleElementShape(
       shaped_buffer->on_device_shape(), index);
 
-  xla::ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape,
-                                      shaped_buffer->platform(),
-                                      shaped_buffer->device_ordinal());
+  ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape,
+                                 shaped_buffer->platform(),
+                                 shaped_buffer->device_ordinal());
 
   auto& shape_tree = shaped_buffer->buffers();
   auto& sub_shape_tree = sub_shaped_buffer.buffers();
@@ -102,8 +105,7 @@ std::unique_ptr<xla::ScopedShapedBuffer> ExtractSubShapedBuffer(
       index_to_buffer.second = gpu::DeviceMemoryBase(nullptr, 0);
     }
   }
-  return xla::ScopedShapedBuffer::MakeScoped(&sub_shaped_buffer, allocator)
-      .ValueOrDie();
+  return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator);
 }
 }  // namespace
 
@@ -118,10 +120,10 @@ XlaComputationLaunchContext::XlaComputationLaunchContext(
 void XlaComputationLaunchContext::PopulateInputs(
     OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
     const std::map<int, OptionalTensor>& variables) {
-  // Build xla::ShapedBuffers that point directly to the Tensor buffers.
+  // Build ShapedBuffers that point directly to the Tensor buffers.
   arg_buffers_.reserve(kernel->xla_input_shapes.size() + 1);
   arg_buffers_.resize(kernel->xla_input_shapes.size());
-  arg_ptrs_ = std::vector<xla::ShapedBuffer*>(arg_buffers_.size());
+  arg_ptrs_ = std::vector<ShapedBuffer*>(arg_buffers_.size());
 
   // Pass remaining parameters.
   const Tensor* t;
@@ -140,8 +142,7 @@ void XlaComputationLaunchContext::PopulateInputs(
     if (xla::ShapeUtil::IsTuple(on_device_shape)) {
       const XlaTensor* xla_tensor = XlaTensor::FromTensor(t);
       CHECK(xla_tensor && xla_tensor->has_shaped_buffer());
-      arg_ptrs_[i] =
-          const_cast<xla::ShapedBuffer*>(&xla_tensor->shaped_buffer());
+      arg_ptrs_[i] = const_cast<ShapedBuffer*>(&xla_tensor->shaped_buffer());
     } else {
       CHECK(xla::ShapeUtil::Equal(shape, on_device_shape))
           << "On-device shape "
@@ -149,7 +150,7 @@ void XlaComputationLaunchContext::PopulateInputs(
           << " not the same as on-host shape "
           << xla::ShapeUtil::HumanStringWithLayout(shape);
       gpu::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t);
-      arg_buffers_[i] = xla::MakeUnique<xla::ShapedBuffer>(
+      arg_buffers_[i] = xla::MakeUnique<ShapedBuffer>(
           /*on_host_shape=*/shape, /*on_device_shape=*/shape,
           client_->platform(), client_->default_device_ordinal());
       arg_buffers_[i]->set_buffer(dmem, /*index=*/{});
@@ -160,15 +161,15 @@ void XlaComputationLaunchContext::PopulateInputs(
 
 void XlaComputationLaunchContext::PopulateOutputs(
     OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
-    std::unique_ptr<xla::ScopedShapedBuffer> output) {
+    ScopedShapedBuffer output) {
   gpu::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
 
   // Computation output should always be a tuple.
   if (VLOG_IS_ON(2)) {
-    VLOG(2) << "Result tuple shape: " << output->on_host_shape().DebugString();
+    VLOG(2) << "Result tuple shape: " << output.on_host_shape().DebugString();
     VLOG(2) << "Result tuple shape (on device): "
-            << output->on_device_shape().DebugString();
+            << output.on_device_shape().DebugString();
   }
   CHECK_EQ(ctx->num_outputs(), kernel->outputs.size());
 
@@ -226,18 +227,18 @@ void XlaComputationLaunchContext::PopulateOutputs(
       const TensorShape& shape = kernel->outputs[i].shape;
       VLOG(2) << "Retval " << i << " shape " << shape.DebugString();
 
-      gpu::DeviceMemoryBase buffer = output->buffer({output_num});
+      gpu::DeviceMemoryBase buffer = output.buffer({output_num});
       if (allocate_xla_tensors_) {
         Tensor* output_tensor;
         OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor));
         XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor);
         CHECK(xla_tensor);
-        xla_tensor->set_shaped_buffer(
-            ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_));
+        xla_tensor->set_shaped_buffer(ScopedShapedBuffer(
+            ExtractSubShapedBuffer(&output, output_num, xla_allocator_)));
       } else {
         Tensor output_tensor = XlaTensorBuffer::MakeTensor(
             ctx->expected_output_dtype(i), shape, buffer, allocator);
-        output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+        output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
         ctx->set_output(i, output_tensor);
       }
       ++output_num;
@@ -257,7 +258,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
                 write.input_index >= 0 && write.input_index < ctx->num_inputs(),
                 errors::Internal("Invalid input index for variable write."));
 
-    gpu::DeviceMemoryBase buffer = output->buffer({output_num});
+    gpu::DeviceMemoryBase buffer = output.buffer({output_num});
 
     Var* variable = nullptr;
     // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor,
@@ -282,12 +283,12 @@ void XlaComputationLaunchContext::PopulateOutputs(
       XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor);
       CHECK(xla_tensor);
       xla_tensor->set_shaped_buffer(
-          ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_));
+          ExtractSubShapedBuffer(&output, output_num, xla_allocator_));
       *variable->tensor() = output_tensor;
     } else {
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           write.type, write.shape, buffer, allocator);
-      output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+      output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
       *variable->tensor() = output_tensor;
     }
     ++output_num;
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 14f70fe358..26dcaa8a51 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -87,7 +87,7 @@ class XlaComputationLaunchContext {
   // Given the XLA output in `output`, populate all outputs of `ctx`.
   void PopulateOutputs(OpKernelContext* ctx,
                        const XlaCompiler::CompilationResult* kernel,
-                       std::unique_ptr<xla::ScopedShapedBuffer> output);
+                       xla::ScopedShapedBuffer output);
 
   // Return the argument list. Only valid after PopulateInputs() has been
   // called.
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index 956328e675..84b2835c40 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -65,10 +65,8 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
                             device_ordinal, size, /*retry_on_failure=*/false));
   }
 
-  TF_ASSIGN_OR_RETURN(auto scoped_buffer,
-                      xla::ScopedShapedBuffer::MakeScoped(
-                          &buffer, client->backend().memory_allocator()));
-  set_shaped_buffer(std::move(scoped_buffer));
+  set_shaped_buffer(xla::ScopedShapedBuffer(
+      std::move(buffer), client->backend().memory_allocator()));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
index 5ff2fb08f0..2334fd272b 100644
--- a/tensorflow/compiler/jit/xla_tensor.h
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -64,9 +64,9 @@ class XlaTensor {
     return *shaped_buffer_;
   }
   // Mutates the TensorInfo to set the ShapedBuffer.
-  void set_shaped_buffer(
-      std::unique_ptr<xla::ScopedShapedBuffer> shaped_buffer) {
-    shaped_buffer_ = std::move(shaped_buffer);
+  void set_shaped_buffer(xla::ScopedShapedBuffer shaped_buffer) {
+    shaped_buffer_ =
+        xla::MakeUnique<xla::ScopedShapedBuffer>(std::move(shaped_buffer));
   }
 
   // Some tensors on the device may have known values on the host. We use these
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index d951c44cb9..d0e945b70f 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -134,7 +134,7 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
+StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     ExecutableRunOptions run_options) {
   TF_RETURN_IF_ERROR(
@@ -167,27 +167,26 @@ StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
     return ExecuteAndDump(&service_options, arguments);
   }
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result,
+      ShapedBuffer result,
       executable_->ExecuteOnStreamWrapper(
           &service_options, run_options.execution_profile(), arguments));
 
-  return MakeUnique<ScopedShapedBuffer>(std::move(*result),
-                                        run_options.allocator());
+  return ScopedShapedBuffer(std::move(result), run_options.allocator());
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::ExecuteAndDump(
+StatusOr<ScopedShapedBuffer> LocalExecutable::ExecuteAndDump(
     const ServiceExecutableRunOptions* run_options,
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   executable_->session_module()->set_execution_platform(
       backend_->platform()->Name());
   TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->session_module()));
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result,
+      ShapedBuffer result,
       executable_->ExecuteOnStream(run_options, arguments,
                                    /*hlo_execution_profile=*/nullptr));
-  TF_RETURN_IF_ERROR(RecordResult(result.get(), executable_->session_module()));
+  TF_RETURN_IF_ERROR(RecordResult(&result, executable_->session_module()));
   TF_RETURN_IF_ERROR(executable_->DumpSessionModule());
-  return ScopedShapedBuffer::MakeScoped(result.get(), run_options->allocator());
+  return ScopedShapedBuffer(std::move(result), run_options->allocator());
 }
 
 tensorflow::Status LocalExecutable::RecordArguments(
@@ -281,9 +280,9 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
                                         updated_options));
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
-                                   DeviceMemoryAllocator* allocator) {
+StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
+    const Literal& literal, int device_ordinal,
+    DeviceMemoryAllocator* allocator) {
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
@@ -293,7 +292,7 @@ LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
   TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                       backend().stream_executor(device_ordinal));
   TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-      executor, literal, *scoped_buffer));
+      executor, literal, scoped_buffer));
   return std::move(scoped_buffer);
 }
 
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 42812b936f..f306c520ed 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -38,7 +38,7 @@ class LocalExecutable {
  public:
   // Run the compiled computation with the given arguments and options and
   // return the result.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> Run(
+  StatusOr<ScopedShapedBuffer> Run(
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       ExecutableRunOptions run_options);
 
@@ -73,7 +73,7 @@ class LocalExecutable {
 
   // Records the computation in a SessionModule proto with the arguments used to
   // invoke it, and the result. Enabled by flag: --tla_dump_executions_to.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteAndDump(
+  StatusOr<ScopedShapedBuffer> ExecuteAndDump(
       const ServiceExecutableRunOptions* run_options,
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
 
@@ -136,7 +136,7 @@ class LocalClient : public Client {
   // ScopedShapedBuffer. If non-null the given memory allocator is used for
   // device memory allocation. If null, the default memory allocator for the
   // device is used.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> LiteralToShapedBuffer(
+  StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer(
       const Literal& literal, int device_ordinal,
       DeviceMemoryAllocator* allocator = nullptr);
 
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 2bacc6a914..24e17abbe0 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -89,17 +89,16 @@ StatusOr<std::unique_ptr<Literal>> TransferFromOutfeedLocalReplica(
   return client->TransferFromOutfeedLocal(shape, device_ordinal);
 }
 
-LocalShapedBuffer::LocalShapedBuffer(
-    std::unique_ptr<ScopedShapedBuffer> shaped_buffer)
+LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer)
     : shaped_buffer_(std::move(shaped_buffer)) {}
 
-const std::unique_ptr<ScopedShapedBuffer>& LocalShapedBuffer::shaped_buffer()
-    const {
-  return shaped_buffer_;
+const ScopedShapedBuffer* LocalShapedBuffer::shaped_buffer() const {
+  return &shaped_buffer_;
 }
 
-static StatusOr<std::unique_ptr<ScopedShapedBuffer>> ToBuffer(
-    LocalClient* client, int device_ordinal, const Literal& arg) {
+static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
+                                             int device_ordinal,
+                                             const Literal& arg) {
   return client->LiteralToShapedBuffer(arg, device_ordinal,
                                        client->backend().memory_allocator());
 }
@@ -109,14 +108,15 @@ LocalShapedBuffer* LocalShapedBuffer::FromLiteral(
     const Literal& argument,
     const tensorflow::gtl::optional<Shape>& shape_with_layout) {
   LocalClient* client = GetOrCreateLocalClient();
-  std::unique_ptr<ScopedShapedBuffer> buf;
-  if (shape_with_layout) {
-    std::unique_ptr<Literal> relaid =
-        argument.Relayout(shape_with_layout.value());
-    buf = ToBuffer(client, /*device_ordinal=*/0, *relaid).ConsumeValueOrDie();
-  } else {
-    buf = ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie();
-  }
+  ScopedShapedBuffer buf = [&] {
+    if (shape_with_layout) {
+      std::unique_ptr<Literal> relaid =
+          argument.Relayout(shape_with_layout.value());
+      return ToBuffer(client, /*device_ordinal=*/0, *relaid)
+          .ConsumeValueOrDie();
+    }
+    return ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie();
+  }();
   return new LocalShapedBuffer(std::move(buf));
 }
 
@@ -158,14 +158,14 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
                 << device_ordinal;
 
         // Transfer arguments in
-        std::vector<std::unique_ptr<ScopedShapedBuffer>> scoped_buffers;
+        std::vector<ScopedShapedBuffer> scoped_buffers;
         scoped_buffers.reserve(arguments.size());
         for (int i = 0; i < arguments.size(); ++i) {
           const Literal& argument = arguments[i];
           const tensorflow::gtl::optional<Shape>& shape_with_layout =
               shapes_with_layout[i];
 
-          StatusOr<std::unique_ptr<ScopedShapedBuffer>> pushed;
+          StatusOr<ScopedShapedBuffer> pushed;
           if (shape_with_layout) {
             std::unique_ptr<Literal> relaid =
                 argument.Relayout(shape_with_layout.value());
@@ -185,7 +185,7 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
         std::vector<const ShapedBuffer*> argument_buffers;
         argument_buffers.reserve(scoped_buffers.size());
         for (auto& buffer : scoped_buffers) {
-          argument_buffers.push_back(buffer.get());
+          argument_buffers.push_back(&buffer);
         }
 
         DeviceAssignment device_assignment =
@@ -202,7 +202,7 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
         options.set_intra_op_thread_pool(
             client->backend().eigen_intra_op_thread_pool_device());
         options.set_device_assignment(&device_assignment);
-        StatusOr<std::unique_ptr<ScopedShapedBuffer>> result_buffer_status =
+        StatusOr<ScopedShapedBuffer> result_buffer_status =
             executable_->Run(argument_buffers, options);
         if (!result_buffer_status.ok()) {
           results[replica] = result_buffer_status.status();
@@ -210,8 +210,8 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
         }
 
         // Transfer result out
-        results[replica] =
-            client->ShapedBufferToLiteral(*result_buffer_status.ValueOrDie());
+        results[replica] = client->ShapedBufferToLiteral(
+            std::move(result_buffer_status).ValueOrDie());
       });
     }
   }
@@ -236,7 +236,7 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers(
   std::vector<const ShapedBuffer*> argument_buffers;
   argument_buffers.reserve(argument_handles.size());
   for (auto& handle : argument_handles) {
-    argument_buffers.push_back(handle->shaped_buffer().get());
+    argument_buffers.push_back(handle->shaped_buffer());
   }
 
   // Execute
@@ -245,7 +245,7 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers(
   options.set_inter_op_thread_pool(client->backend().inter_op_thread_pool());
   options.set_intra_op_thread_pool(
       client->backend().eigen_intra_op_thread_pool_device());
-  std::unique_ptr<ScopedShapedBuffer> result_buffer =
+  ScopedShapedBuffer result_buffer =
       executable_->Run(argument_buffers, options).ConsumeValueOrDie();
 
   return new LocalShapedBuffer(std::move(result_buffer));
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 31046e60f1..e1048909ab 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -62,12 +62,12 @@ class LocalShapedBuffer {
   static LocalShapedBuffer* FromLiteral(
       const Literal& argument,
       const tensorflow::gtl::optional<Shape>& shape_with_layout);
-  LocalShapedBuffer(std::unique_ptr<ScopedShapedBuffer> shaped_buffer);
-  const std::unique_ptr<ScopedShapedBuffer>& shaped_buffer() const;
+  LocalShapedBuffer(ScopedShapedBuffer shaped_buffer);
+  const ScopedShapedBuffer* shaped_buffer() const;
   std::unique_ptr<Literal> ToLiteral() const;
 
  private:
-  std::unique_ptr<ScopedShapedBuffer> shaped_buffer_;
+  ScopedShapedBuffer shaped_buffer_;
 };
 
 // Wraps a LocalExecutable produced by compiling a
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 359582a78c..6bf65825cd 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -31,52 +31,51 @@ limitations under the License.
 namespace xla {
 
 StatusOr<GlobalDataHandle> AllocationTracker::Register(
-    std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag) {
+    ShapedBuffer shaped_buffer, const string& tag) {
   tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "Register";
-  std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers;
+  std::vector<ShapedBuffer> replicated_buffers;
   replicated_buffers.emplace_back(std::move(shaped_buffer));
   return RegisterInternal(std::move(replicated_buffers), tag);
 }
 
 StatusOr<GlobalDataHandle> AllocationTracker::RegisterReplicatedBuffers(
-    std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers,
-    const string& tag) {
+    std::vector<ShapedBuffer> replicated_buffers, const string& tag) {
   tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "RegisterReplicatedBuffers";
   return RegisterInternal(std::move(replicated_buffers), tag);
 }
 
 StatusOr<GlobalDataHandle> AllocationTracker::RegisterInternal(
-    std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers,
-    const string& tag) {
+    std::vector<ShapedBuffer> replicated_buffers, const string& tag) {
   VLOG(2) << "RegisterInternal("
           << "tag: \"" << tag << "\" with " << replicated_buffers.size()
           << " shaped_buffers.";
   for (const auto& shaped_buffer : replicated_buffers) {
-    VLOG(2) << "shaped_buffer:" << *shaped_buffer;
-    if (shaped_buffer->platform() != backend_->platform()) {
+    VLOG(2) << "shaped_buffer:" << shaped_buffer;
+    if (shaped_buffer.platform() != backend_->platform()) {
       return InvalidArgument(
           "AllocationTracker for platform %s cannot register buffer from "
           "platform %s",
           backend_->platform()->Name().c_str(),
-          shaped_buffer->platform()->Name().c_str());
+          shaped_buffer.platform()->Name().c_str());
     }
   }
 
   int64 handle = next_handle_++;
   for (auto& shaped_buffer : replicated_buffers) {
     std::vector<ShapeIndex> shape_indices;
-    ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(),
+    ShapeUtil::ForEachSubshape(shaped_buffer.on_device_shape(),
                                [this, &shape_indices](const Shape& /*subshape*/,
                                                       const ShapeIndex& index) {
                                  shape_indices.push_back(index);
                                });
     for (const ShapeIndex& index : shape_indices) {
-      AddAllocationOrIncrementRefCount(shaped_buffer->buffer(index),
-                                       shaped_buffer->device_ordinal());
+      AddAllocationOrIncrementRefCount(shaped_buffer.buffer(index),
+                                       shaped_buffer.device_ordinal());
     }
-    handle_to_shaped_buffers_[handle].emplace_back(std::move(shaped_buffer));
+    handle_to_shaped_buffers_[handle].emplace_back(
+        MakeUnique<ShapedBuffer>(std::move(shaped_buffer)));
   }
 
   GlobalDataHandle result;
@@ -146,13 +145,13 @@ StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
   for (int i = 0;
        i < ShapeUtil::TupleElementCount(shaped_buffer->on_device_shape());
        ++i) {
-    auto element_buffer = MakeUnique<ShapedBuffer>(
+    auto element_buffer = ShapedBuffer(
         ShapeUtil::GetTupleElementShape(shaped_buffer->on_host_shape(), i),
         ShapeUtil::GetTupleElementShape(shaped_buffer->on_device_shape(), i),
         shaped_buffer->platform(), shaped_buffer->device_ordinal());
-    element_buffer->set_buffer(shaped_buffer->buffer(/*index=*/{i}),
-                               /*index=*/{});
-    std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers;
+    element_buffer.set_buffer(shaped_buffer->buffer(/*index=*/{i}),
+                              /*index=*/{});
+    std::vector<ShapedBuffer> replicated_buffers;
     replicated_buffers.emplace_back(std::move(element_buffer));
     TF_ASSIGN_OR_RETURN(
         GlobalDataHandle element_handle,
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index 60e93358ef..2bfcd53712 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -45,14 +45,13 @@ class AllocationTracker {
   // Registers a shaped buffer of device memory, and returns a corresponding
   // handle that can be used for talking to XLA clients. The given shaped buffer
   // will be treated as the buffer corresponding to the only replica.
-  StatusOr<GlobalDataHandle> Register(
-      std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag);
+  StatusOr<GlobalDataHandle> Register(ShapedBuffer shaped_buffer,
+                                      const string& tag);
 
   // Registers a vector of shaped buffers of device memory, one per replica, and
   // returns a corresponding handle that can be used for talking to XLA clients.
   StatusOr<GlobalDataHandle> RegisterReplicatedBuffers(
-      std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers,
-      const string& tag);
+      std::vector<ShapedBuffer> replicated_buffers, const string& tag);
 
   // Unregister the allocation for the given data handle.
   Status Unregister(const GlobalDataHandle& data);
@@ -95,8 +94,8 @@ class AllocationTracker {
   // Internal helper which registers a vector of shaped buffers, one per
   // replica.
   StatusOr<GlobalDataHandle> RegisterInternal(
-      std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers,
-      const string& tag) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+      std::vector<ShapedBuffer> replicated_buffers, const string& tag)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Resets the shaped buffers corresponding to the given handle.
   Status Reset(const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
@@ -132,6 +131,9 @@ class AllocationTracker {
 
   // A map from data handle to a vector of shaped buffers that represent the
   // buffers for different replicas.
+  //
+  // The ShapedBuffers in this map's vectors need to be unique_ptrs, because our
+  // public API returns pointers to them.
   tensorflow::gtl::FlatMap<int64, std::vector<std::unique_ptr<ShapedBuffer>>>
       handle_to_shaped_buffers_ GUARDED_BY(mutex_);
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index aee62a4935..97e550abe4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -243,18 +243,18 @@ static Status DeallocateTempBuffers(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
+StatusOr<ShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
     std::vector<bool>* buffers_in_result) {
   se::Stream* stream = run_options->stream();
-  auto result_buffer = MakeUnique<ShapedBuffer>(
+  ShapedBuffer result_buffer(
       /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
       stream->parent()->platform(), stream->parent()->device_ordinal());
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer which is returned to the caller.
-  TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus(
+  TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
       [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
         const auto& sources = this->GetRootPointsToSet().element(index);
         // The points to set is unambiguous so the set should be a
@@ -281,7 +281,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
   return std::move(result_buffer);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
+StatusOr<ShapedBuffer> CpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -300,7 +300,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
 
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result_buffer,
+      ShapedBuffer result_buffer,
       CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
 
   // Free all buffers not in the result.
@@ -310,7 +310,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
   return std::move(result_buffer);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   if (hlo_profiling_enabled()) {
@@ -330,7 +330,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
 
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result_buffer,
+      ShapedBuffer result_buffer,
       CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
 
   LogLiveAddresses(buffers, buffers_in_result);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index c3c2820c26..06b6943cb5 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -55,12 +55,12 @@ class CpuExecutable : public Executable {
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~CpuExecutable() override {}
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
@@ -102,13 +102,13 @@ class CpuExecutable : public Executable {
       tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
       HloExecutionProfile* hlo_execution_profile);
 
-  // Create a ShapedBuffer for holding the result of the computation. The
+  // Creates a ShapedBuffer for holding the result of the computation. The
   // addresses (DeviceMemoryBases) are set according to buffer assignment.
   // 'buffers_in_result' should point to a vector of the same size as
   // 'allocated_buffers'. An element in buffers_in_result is set to true if the
   // corresponding buffer is live out of the computation (and thus contained in
   // the returned ShapedBuffer).
-  StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
+  StatusOr<ShapedBuffer> CreateResultShapedBuffer(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
       std::vector<bool>* buffers_in_result);
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index 2d0f1d0be5..a2bd4fa195 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -447,7 +447,7 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
+StatusOr<ShapedBuffer> ParallelCpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -459,7 +459,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
   std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
 
-  auto result_buffer = MakeUnique<ShapedBuffer>(
+  ShapedBuffer result_buffer(
       /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
       stream->parent()->platform(), stream->parent()->device_ordinal());
 
@@ -472,7 +472,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   // Copy DeviceMemoryBase values which into the respective location in
   // ShapedBuffer which is returned to the caller.
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
-  TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus(
+  TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
       [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
         const auto& sources = this->GetRootPointsToSet().element(index);
 
@@ -511,8 +511,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   return std::move(result_buffer);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>>
-ParallelCpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> ParallelCpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index d87ba57a1e..5ce84fa996 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -59,12 +59,12 @@ class ParallelCpuExecutable : public Executable {
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~ParallelCpuExecutable() override {}
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index caa46686be..b097ef79cc 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -29,18 +29,19 @@ using tensorflow::gtl::ArraySlice;
 
 namespace xla {
 
-StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>>
-Executable::ExecuteOnStreams(
+StatusOr<std::vector<ShapedBuffer>> Executable::ExecuteOnStreams(
     ArraySlice<const ServiceExecutableRunOptions> run_options,
     ArraySlice<ArraySlice<const ShapedBuffer*>> arguments) {
   TF_RET_CHECK(run_options.size() == arguments.size());
 
-  std::vector<std::unique_ptr<ShapedBuffer>> return_values(run_options.size());
+  std::vector<ShapedBuffer> return_values;
+  return_values.reserve(run_options.size());
 
   if (run_options.size() == 1) {
-    TF_ASSIGN_OR_RETURN(return_values[0],
+    TF_ASSIGN_OR_RETURN(auto rv,
                         ExecuteOnStream(&run_options[0], arguments[0],
                                         /*hlo_execution_profile=*/nullptr));
+    return_values.push_back(std::move(rv));
     return std::move(return_values);
   }
 
@@ -48,8 +49,9 @@ Executable::ExecuteOnStreams(
     // We cannot BlockHostUntilDone() on the already-launched executions in case
     // of error, since if the executions communicate, the initially launched
     // executions may never complete if not all executions are running.
-    TF_ASSIGN_OR_RETURN(return_values[i],
+    TF_ASSIGN_OR_RETURN(auto rv,
                         ExecuteAsyncOnStream(&run_options[i], arguments[i]));
+    return_values.push_back(std::move(rv));
   }
   for (const auto& options : run_options) {
     TF_RET_CHECK(options.stream() != nullptr);
@@ -58,7 +60,7 @@ Executable::ExecuteOnStreams(
   return std::move(return_values);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
+StatusOr<ShapedBuffer> Executable::ExecuteOnStreamWrapper(
     const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
     ArraySlice<const ShapedBuffer*> arguments) {
   se::Stream* stream = run_options->stream();
@@ -78,7 +80,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
                                             &hlo_profile_index_map())
           : nullptr;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> return_value =
+  StatusOr<ShapedBuffer> return_value =
       ExecuteOnStream(run_options, arguments, profile_ptr.get());
   TF_RETURN_IF_ERROR(return_value.status());
 
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 6f4cd99767..9c725f21d8 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -62,14 +62,14 @@ class Executable {
   // enabled.
   //
   // Returns a shaped buffer containing the result of the computation.
-  virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  virtual StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) = 0;
 
   // Same as ExecuteOnStream(), but this call is non-blocking and returns as
   // soon as all of the operations are enqueued for launch on the stream.
-  virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  virtual StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) = 0;
 
@@ -77,7 +77,7 @@ class Executable {
   // streams. arguments[i] contains the arguments to the execution on
   // run_options[i]->stream() and the returned value is at index i of the
   // returned vector.
-  virtual StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>> ExecuteOnStreams(
+  virtual StatusOr<std::vector<ShapedBuffer>> ExecuteOnStreams(
       tensorflow::gtl::ArraySlice<const ServiceExecutableRunOptions>
           run_options,
       tensorflow::gtl::ArraySlice<
@@ -97,7 +97,7 @@ class Executable {
   // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
   // timer for the execution, sets up HLO profiling if enabled, and fills in the
   // given ExecutionProfile if non-null.
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStreamWrapper(
+  StatusOr<ShapedBuffer> ExecuteOnStreamWrapper(
       const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 5676d4de8e..62ce15bc59 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -250,7 +250,7 @@ Status GpuExecutable::ExecuteThunks(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
+StatusOr<ShapedBuffer> GpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -297,13 +297,13 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
 
   HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
   auto device_ordinal = executor->device_ordinal();
-  auto shaped_buffer = MakeUnique<ShapedBuffer>(
-      root->shape(), root->shape(), executor->platform(), device_ordinal);
+  auto shaped_buffer = ShapedBuffer(root->shape(), root->shape(),
+                                    executor->platform(), device_ordinal);
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer.
   std::set<se::DeviceMemoryBase> buffers_in_result;
-  TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus(
+  TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachMutableElementWithStatus(
       [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
           const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
         const auto& sources = this->GetRootPointsToSet().element(index);
@@ -335,7 +335,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
   return std::move(shaped_buffer);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index dcb3991f41..361bc30b2f 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -74,12 +74,12 @@ class GpuExecutable : public Executable {
 
   // ExecuteOnStream will fail if the compute capability of the stream doesn't
   // match the compute capability passed to this object's constructor.
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 171477299e..df5ffd0b7d 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -107,33 +107,35 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
   const ExecutableRunOptions& run_options = service_run_options.run_options();
 
   // Copy arguments to device.
-  std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
-  std::vector<ShapedBuffer*> argument_buffer_ptrs;
+  std::vector<ScopedShapedBuffer> argument_buffers;
   for (Literal* argument : arguments) {
     TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<ScopedShapedBuffer> argument_buffer,
+        ScopedShapedBuffer argument_buffer,
         backend().transfer_manager()->AllocateScopedShapedBuffer(
             argument->shape(), run_options.allocator(),
             run_options.device_ordinal()));
     TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-        stream.parent(), *argument, *argument_buffer));
+        stream.parent(), *argument, argument_buffer));
     argument_buffers.push_back(std::move(argument_buffer));
-    argument_buffer_ptrs.push_back(argument_buffers.back().get());
+  }
+
+  std::vector<const ShapedBuffer*> argument_buffer_ptrs;
+  argument_buffer_ptrs.reserve(argument_buffers.size());
+  for (const auto& buf : argument_buffers) {
+    argument_buffer_ptrs.push_back(&buf);
   }
 
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result,
+      ShapedBuffer result,
       executable->ExecuteOnStreamWrapper(
           &service_run_options, /*profile=*/nullptr, argument_buffer_ptrs));
 
   // Create a ScopedShapedBuffer of the result to manage deallocation. This will
   // deallocate all the device memory when it goes out of scope.
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ScopedShapedBuffer> scoped_result,
-      ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator()));
+  ScopedShapedBuffer scoped_result(std::move(result), run_options.allocator());
 
   auto result_literal = backend().transfer_manager()->TransferLiteralFromDevice(
-      stream.parent(), *scoped_result);
+      stream.parent(), scoped_result);
   if (result_literal.ok()) {
     VLOG(4) << "Executed binary and got result: "
             << result_literal.ValueOrDie()->ToString();
@@ -155,7 +157,13 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
       backend().computation_placer()->AssignDevices(options.num_replicas, 1));
   std::vector<std::unique_ptr<se::Stream>> streams;
   std::vector<ServiceExecutableRunOptions> service_run_options;
-  std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
+
+  std::vector<ScopedShapedBuffer> argument_buffers;
+  // This reserve() call is necessary for correctness, because
+  // argument_buffer_ptrs contains pointers into the elements of
+  // argument_buffers.
+  argument_buffers.reserve(options.num_replicas * options.arguments.size());
+
   // Plus one so we can safely get &argument_buffer_ptrs[0] in case there are
   // no arguments.
   std::vector<const ShapedBuffer*> argument_buffer_ptrs(
@@ -175,13 +183,13 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
     // Copy arguments to device.
     for (const Literal* argument : options.arguments) {
       TF_ASSIGN_OR_RETURN(
-          std::unique_ptr<ScopedShapedBuffer> argument_buffer,
+          ScopedShapedBuffer argument_buffer,
           backend().transfer_manager()->AllocateScopedShapedBuffer(
               argument->shape(), backend().memory_allocator(), device));
       TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-          executor, *argument, *argument_buffer));
+          executor, *argument, argument_buffer));
       argument_buffers.push_back(std::move(argument_buffer));
-      argument_buffer_ptrs[index++] = argument_buffers.back().get();
+      argument_buffer_ptrs[index++] = &argument_buffers.back();
     }
     argument_buffer_slices.emplace_back(
         &argument_buffer_ptrs[index - options.arguments.size()],
@@ -240,19 +248,18 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
   }
 
   LOG(INFO) << "Replicated execution started";
-  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<ShapedBuffer>> results,
+  TF_ASSIGN_OR_RETURN(std::vector<ShapedBuffer> results,
                       executable->ExecuteOnStreams(service_run_options,
                                                    argument_buffer_slices));
   LOG(INFO) << "Replicated execution terminated";
 
   std::vector<std::unique_ptr<Literal>> exec_results;
   for (int64 i = 0; i < options.num_replicas; ++i) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<ScopedShapedBuffer> result,
-                        ScopedShapedBuffer::MakeScoped(
-                            results[i].get(), backend().memory_allocator()));
+    ScopedShapedBuffer result(std::move(results[i]),
+                              backend().memory_allocator());
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
                         backend().transfer_manager()->TransferLiteralFromDevice(
-                            streams[i]->parent(), *result));
+                            streams[i]->parent(), result));
     exec_results.push_back(std::move(literal));
   }
   return std::move(exec_results);
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index acfa79ea75..6553000336 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -45,7 +45,7 @@ InterpreterExecutable::InterpreterExecutable(
 
 InterpreterExecutable::~InterpreterExecutable() {}
 
-StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
+StatusOr<ShapedBuffer> InterpreterExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -88,12 +88,12 @@ StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
       evaluator.Evaluate<std::unique_ptr<Literal>>(*computation, arg_literals));
 
   // Transform the result literal back into a ShapedBuffer.
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> result,
+  TF_ASSIGN_OR_RETURN(ShapedBuffer result,
                       transfer_manager->AllocateShapedBuffer(
                           result_literal->shape(), run_options->allocator(),
                           executor->device_ordinal()));
   TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice(
-      executor, *result_literal, *result));
+      executor, *result_literal, result));
 
   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
@@ -106,8 +106,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
   return std::move(result);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>>
-InterpreterExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> InterpreterExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return tensorflow::errors::Unimplemented(
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index 410110a1ad..c825a9a368 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -43,12 +43,12 @@ class InterpreterExecutable : public Executable {
   InterpreterExecutable(std::unique_ptr<const HloModule> hlo_module);
   ~InterpreterExecutable() override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 2df59c3556..39f3aefdf8 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -550,7 +550,7 @@ Service::ExecuteParallelAndRegisterResult(
     // Stream executors for the replicas of the current computation.
     TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i]));
     CHECK_EQ(replicas.size(), arguments[i].size());
-    std::vector<std::unique_ptr<ShapedBuffer>> result_buffers;
+    std::vector<ShapedBuffer> result_buffers;
     for (int64 replica = 0; replica < replicas.size(); ++replica) {
       TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
                           backend->BorrowStream(replicas[replica]));
@@ -582,7 +582,7 @@ Service::ExecuteParallelAndRegisterResult(
                                               backend->StreamBorrower());
 
       // Asynchronously launch the computation.
-      TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> result,
+      TF_ASSIGN_OR_RETURN(ShapedBuffer result,
                           executables[i]->ExecuteAsyncOnStream(
                               &run_options, arguments[i][replica]));
 
@@ -1234,7 +1234,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
     streams.push_back(std::move(stream));
   }
 
-  std::vector<std::unique_ptr<ShapedBuffer>> result_buffers;
+  std::vector<ShapedBuffer> result_buffers;
   for (size_t i = 0; i < streams.size(); ++i) {
     const auto& stream = streams[i];
     ExecutableRunOptions options;
@@ -1247,7 +1247,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
     ServiceExecutableRunOptions service_options(
         options, execute_backend_->StreamBorrower());
 
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> this_result_buffer,
+    TF_ASSIGN_OR_RETURN(ShapedBuffer this_result_buffer,
                         executable->ExecuteAsyncOnStream(
                             &service_options, replicated_arguments[i]));
 
@@ -1347,16 +1347,16 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
   }
 
   // Allocate memory in each replica and transfer the data to all replicas.
-  std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers;
+  std::vector<ShapedBuffer> replicated_buffers;
   for (se::StreamExecutor* executor : replicas) {
     TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<ShapedBuffer> shaped_buffer,
+        ShapedBuffer shaped_buffer,
         execute_backend_->transfer_manager()->AllocateShapedBuffer(
             shape, execute_backend_->memory_allocator(),
             executor->device_ordinal()));
     TF_RETURN_IF_ERROR(
         execute_backend_->transfer_manager()->TransferLiteralToDevice(
-            executor, *literal, *shaped_buffer));
+            executor, *literal, shaped_buffer));
     replicated_buffers.emplace_back(std::move(shaped_buffer));
   }
   TF_ASSIGN_OR_RETURN(*result->mutable_data(),
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 10a2aa2b30..0b5a383f6f 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -66,6 +66,8 @@ ShapedBuffer& ShapedBuffer::operator=(ShapedBuffer&& s) {
   return *this;
 }
 
+ShapedBuffer::~ShapedBuffer() {}
+
 void ShapedBuffer::clear() {
   for (auto& pair : buffers_) {
     // A default constructed DeviceMemoryBase is a null pointer.
@@ -102,18 +104,6 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) {
   return out;
 }
 
-/* static */
-StatusOr<std::unique_ptr<ScopedShapedBuffer>> ScopedShapedBuffer::MakeScoped(
-    ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator) {
-  auto scoped_buffer = WrapUnique(new ScopedShapedBuffer(
-      shaped_buffer->on_host_shape(), shaped_buffer->on_device_shape(),
-      allocator, shaped_buffer->device_ordinal()));
-  scoped_buffer->buffers_ = shaped_buffer->buffers();
-  shaped_buffer->clear();
-
-  return std::move(scoped_buffer);
-}
-
 ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape,
                                        const Shape& on_device_shape,
                                        DeviceMemoryAllocator* allocator,
@@ -126,7 +116,25 @@ ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer,
                                        DeviceMemoryAllocator* allocator)
     : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {}
 
+ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s)
+    : ShapedBuffer(std::move(s)), allocator_(s.allocator_) {
+  // A null allocator_ marks s as moved-from; its destructor then frees nothing.
+  s.allocator_ = nullptr;
+}
+
+ScopedShapedBuffer& ScopedShapedBuffer::operator=(ScopedShapedBuffer&& s) {
+  ScopedShapedBuffer old(std::move(*this));  // Frees our old buffers on exit.
+  *static_cast<ShapedBuffer*>(this) = std::move(static_cast<ShapedBuffer&>(s));
+  allocator_ = s.allocator_;
+  s.allocator_ = nullptr;  // s must not free anything in its destructor.
+  return *this;
+}
+
 ScopedShapedBuffer::~ScopedShapedBuffer() {
+  // allocator_ will be null if we were moved-from.
+  if (allocator_ == nullptr) {
+    return;
+  }
   // Deallocate all non-null buffers. A buffer may appear in more than one spot
   // in the shape (eg, a tuple with a repeated element) so keep track of what
   // has been deallocated.
@@ -142,8 +150,8 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
   }
 }
 
-std::unique_ptr<ShapedBuffer> ScopedShapedBuffer::release() {
-  auto shaped_buffer = MakeUnique<ShapedBuffer>(std::move(*this));
+ShapedBuffer ScopedShapedBuffer::release() {
+  ShapedBuffer shaped_buffer(std::move(*this));
   buffers_ = ShapeTree<se::DeviceMemoryBase>();
   return shaped_buffer;
 }
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index 62ba8f2734..f1b0527474 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -43,6 +43,14 @@ class ShapedBuffer {
   ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
                const se::Platform* platform, int device_ordinal);
 
+  // Movable, but not copyable.
+  ShapedBuffer(ShapedBuffer&& s);
+  ShapedBuffer& operator=(ShapedBuffer&&);
+  ShapedBuffer(const ShapedBuffer&) = delete;
+  ShapedBuffer& operator=(const ShapedBuffer&) = delete;
+
+  virtual ~ShapedBuffer();
+
   // Returns the shape of the on-host representation of the data held by this
   // ShapedBuffer.
   const Shape& on_host_shape() const { return on_host_shape_; }
@@ -80,13 +88,7 @@ class ShapedBuffer {
 
   string ToString() const;
 
-  ShapedBuffer(ShapedBuffer&& s);
-  ShapedBuffer& operator=(ShapedBuffer&&);
-
  protected:
-  ShapedBuffer(const ShapedBuffer&) = delete;
-  ShapedBuffer& operator=(const ShapedBuffer&) = delete;
-
   // The shape of the data when represented on the host.
   Shape on_host_shape_;
 
@@ -108,41 +110,45 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
 // ShapedBuffer derived class which allocates all internal buffers on
 // construction and deallocates the memory when the object is
 // destructed.
+//
+// TODO(timshen): Remove inheritance between ScopedShapedBuffer and
+// ShapedBuffer.  There should never be a need to consider a ScopedShapedBuffer
+// as a ShapedBuffer, because in that case we should just be able to pass around
+// our ShapeTree<DeviceMemoryBase>.  Inheritance only adds complexity.  See
+// discussion in cl/192849370.
 class ScopedShapedBuffer : public ShapedBuffer {
  public:
-  // Takes a ShapedBuffer and returns a ScopedShapedBuffer which manages the
-  // deallocation of the device memory held in the shaped buffer. All device
-  // memory pointers in the given ShapedBuffer are set to null.
-  static StatusOr<std::unique_ptr<ScopedShapedBuffer>> MakeScoped(
-      ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator);
-
-  // Create a ScopedShapedBuffer with null DeviceMemoryBases at each index.
-  ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
-                     DeviceMemoryAllocator* allocator, int device_ordinal);
+  // Creates a ScopedShapedBuffer with null DeviceMemoryBases at each index.
+  explicit ScopedShapedBuffer(const Shape& on_host_shape,
+                              const Shape& on_device_shape,
+                              DeviceMemoryAllocator* allocator,
+                              int device_ordinal);
 
   // Create a ScopedShapedBuffer by taking over the memory from the incoming
   // ShapedBuffer.
-  ScopedShapedBuffer(ShapedBuffer shaped_buffer,
-                     DeviceMemoryAllocator* allocator);
+  explicit ScopedShapedBuffer(ShapedBuffer shaped_buffer,
+                              DeviceMemoryAllocator* allocator);
+
+  // Movable, but not copyable.
+  ScopedShapedBuffer(ScopedShapedBuffer&& s);
+  ScopedShapedBuffer& operator=(ScopedShapedBuffer&&);
+  ScopedShapedBuffer(const ScopedShapedBuffer&) = delete;
+  ScopedShapedBuffer& operator=(const ScopedShapedBuffer&) = delete;
+
+  // All buffers in the shape are deallocated on destruction.
+  ~ScopedShapedBuffer() override;
 
   // Return the allocator used to allocate the device memory held in this
   // ScopedShapedBuffer.
   DeviceMemoryAllocator* memory_allocator() const { return allocator_; }
 
-  // Release all device memory owned by this ScopedShapedBuffer and
-  // return the device memory pointers in the form of a
-  // ShapedBuffer. The returned ShapedBuffer takes over the memory
-  // from the ScopedShapedBuffer. The resulting ScopedShapedBuffer can
-  // only be destroyed.
-  std::unique_ptr<ShapedBuffer> release();
-
-  // All buffers in the shape are deallocated on destruction.
-  virtual ~ScopedShapedBuffer();
+  // Releases all device memory owned by this ScopedShapedBuffer and returns the
+  // device memory pointers in the form of a ShapedBuffer. The returned
+  // ShapedBuffer takes over the memory from the ScopedShapedBuffer. The
+  // resulting ScopedShapedBuffer can only be destroyed.
+  ShapedBuffer release();
 
  protected:
-  ScopedShapedBuffer(const ScopedShapedBuffer&) = delete;
-  void operator=(const ScopedShapedBuffer&) = delete;
-
   DeviceMemoryAllocator* allocator_;
 };
 
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index be8231b73c..98d0111d04 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -175,7 +175,7 @@ Status TransferManager::TransferBufferToDevice(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> TransferManager::AllocateShapedBuffer(
+StatusOr<ShapedBuffer> TransferManager::AllocateShapedBuffer(
     const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
     int device_ordinal) {
   if (!LayoutUtil::HasLayout(on_host_shape)) {
@@ -187,31 +187,30 @@ StatusOr<std::unique_ptr<ShapedBuffer>> TransferManager::AllocateShapedBuffer(
   const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape);
   TF_RET_CHECK(LayoutUtil::HasLayout(on_device_shape));
 
-  auto shaped_buffer = WrapUnique(new ShapedBuffer(
-      on_host_shape, on_device_shape, allocator->platform(), device_ordinal));
+  ShapedBuffer shaped_buffer(on_host_shape, on_device_shape,
+                             allocator->platform(), device_ordinal);
 
   // Allocate an appropriate sized buffer for each element in the shape
   // including the tuple pointer arrays.
-  for (auto& pair : shaped_buffer->buffers()) {
+  for (auto& pair : shaped_buffer.buffers()) {
     const ShapeIndex& index = pair.first;
     se::DeviceMemoryBase& memory_base = pair.second;
     const Shape& subshape = ShapeUtil::GetSubshape(on_device_shape, index);
     TF_ASSIGN_OR_RETURN(memory_base,
-                        allocator->Allocate(shaped_buffer->device_ordinal(),
+                        allocator->Allocate(shaped_buffer.device_ordinal(),
                                             GetByteSizeRequirement(subshape)));
   }
 
   return std::move(shaped_buffer);
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-TransferManager::AllocateScopedShapedBuffer(const Shape& on_host_shape,
-                                            DeviceMemoryAllocator* allocator,
-                                            int device_ordinal) {
+StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer(
+    const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+    int device_ordinal) {
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> unscoped_buffer,
+      ShapedBuffer unscoped_buffer,
       AllocateShapedBuffer(on_host_shape, allocator, device_ordinal));
-  return ScopedShapedBuffer::MakeScoped(unscoped_buffer.get(), allocator);
+  return ScopedShapedBuffer(std::move(unscoped_buffer), allocator);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 410d2af7af..a6451c4bb1 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -107,10 +107,10 @@ class TransferManager {
   // Allocate a ShapedBuffer which can hold data with the given on-host
   // shape. The on-device shape may be different as indicated by
   // HostShapeToDeviceShape.
-  StatusOr<std::unique_ptr<ShapedBuffer>> AllocateShapedBuffer(
-      const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
-      int device_ordinal);
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> AllocateScopedShapedBuffer(
+  StatusOr<ShapedBuffer> AllocateShapedBuffer(const Shape& on_host_shape,
+                                              DeviceMemoryAllocator* allocator,
+                                              int device_ordinal);
+  StatusOr<ScopedShapedBuffer> AllocateScopedShapedBuffer(
       const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
       int device_ordinal);
 
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 464b8cbebb..021fbcedb9 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -735,11 +735,11 @@ void BM_DynamicSlice(int num_iters) {
 
   auto start_indices_literal = Literal::CreateR1<int32>({0, 1, 2, 3});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *start_indices_literal, *buffer));
+      executors[device_ordinal], *start_indices_literal, buffer));
 
   std::unique_ptr<LocalExecutable> executable =
       client
-          ->Compile(computation, {&buffer->on_host_shape()},
+          ->Compile(computation, {&buffer.on_host_shape()},
                     ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
@@ -748,14 +748,14 @@ void BM_DynamicSlice(int num_iters) {
   options.set_allocator(&allocator);
   const int kWarmups = 2;
   for (int i = 0; i < kWarmups; ++i) {
-    auto result = executable->Run({buffer.get()}, options);
+    auto result = executable->Run({&buffer}, options);
     ASSERT_TRUE(result.ok());
   }
 
   // Run benchmark.
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result = executable->Run({buffer.get()}, options);
+    auto result = executable->Run({&buffer}, options);
     ASSERT_TRUE(result.ok());
   }
 }
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index ed16963b40..c7f64d8560 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -794,19 +794,19 @@ void BM_ParallelFusion(int num_iters) {
   // Transfer literals to device.
   auto param0_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1);
-  std::unique_ptr<ShapedBuffer> buffer0 =
+  ShapedBuffer buffer0 =
       client->LiteralToShapedBuffer(*param0_literal, device_ordinal)
           .ConsumeValueOrDie();
 
   auto param1_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1);
-  std::unique_ptr<ShapedBuffer> buffer1 =
+  ShapedBuffer buffer1 =
       client->LiteralToShapedBuffer(*param1_literal, device_ordinal)
           .ConsumeValueOrDie();
 
   auto param2_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1);
-  std::unique_ptr<ShapedBuffer> buffer2 =
+  ShapedBuffer buffer2 =
       client->LiteralToShapedBuffer(*param2_literal, device_ordinal)
           .ConsumeValueOrDie();
 
@@ -814,8 +814,8 @@ void BM_ParallelFusion(int num_iters) {
   std::unique_ptr<LocalExecutable> executable =
       client
           ->Compile(computation,
-                    {&buffer0->on_host_shape(), &buffer1->on_host_shape(),
-                     &buffer2->on_host_shape()},
+                    {&buffer0.on_host_shape(), &buffer1.on_host_shape(),
+                     &buffer2.on_host_shape()},
                     ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
@@ -836,8 +836,7 @@ void BM_ParallelFusion(int num_iters) {
   // Run some warm-up executions.
   const int kWarmups = 2;
   for (int i = 0; i < kWarmups; ++i) {
-    auto result =
-        executable->Run({buffer0.get(), buffer1.get(), buffer2.get()}, options);
+    auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
     ASSERT_TRUE(result.ok());
   }
 
@@ -850,8 +849,7 @@ void BM_ParallelFusion(int num_iters) {
   tensorflow::testing::UseRealTime();
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result =
-        executable->Run({buffer0.get(), buffer1.get(), buffer2.get()}, options);
+    auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
     ASSERT_TRUE(result.ok());
   }
 }
diff --git a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
index 3d30ceeaf1..7209f91639 100644
--- a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -53,7 +54,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
   // deallocation happen on the right allocator.
   ExecutableRunOptions options;
   options.set_allocator(allocator);
-  std::unique_ptr<ScopedShapedBuffer> result =
+  tensorflow::gtl::optional<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {},
                           DefaultExecutableBuildOptions(), options);
 
@@ -66,7 +67,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
 
   // Deallocate result and verify that deallocate was called once.
   int64 deallocation_count_before = allocator_->deallocation_count();
-  result = nullptr;
+  result.reset();
   EXPECT_EQ(deallocation_count_before + 1, allocator_->deallocation_count());
 }
 
@@ -92,7 +93,7 @@ XLA_TEST_F(LocalClientAllocationTest, RunOnDevices) {
         computation, {}, ExecutableBuildOptions().set_device_ordinal(d),
         ExecutableRunOptions().set_device_ordinal(d).set_allocator(allocator));
     LiteralTestUtil::ExpectR1Near<float>(
-        {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+        {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
 
     // At least one allocation should have been performed when executing the
     // computation.
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 373dd3c5df..7e14e77366 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -57,10 +57,9 @@ XLA_TEST_F(LocalClientExecuteTest, Constant) {
   ComputationBuilder builder(local_client_, TestName());
   auto y = builder.ConstantR0<float>(123.0f);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
+  ScopedShapedBuffer result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {});
-
-  LiteralTestUtil::ExpectR0Near<float>(123.f, *ShapedBufferToLiteral(*result),
+  LiteralTestUtil::ExpectR0Near<float>(123.f, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }
 
@@ -71,10 +70,9 @@ XLA_TEST_F(LocalClientExecuteTest, AddScalars) {
   builder.Add(x, y);
 
   auto x_value = LiteralToShapedBuffer(*Literal::CreateR0<float>(42.0f));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_value.get()});
-
-  LiteralTestUtil::ExpectR0Near<float>(165.f, *ShapedBufferToLiteral(*result),
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_value});
+  LiteralTestUtil::ExpectR0Near<float>(165.f, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }
 
@@ -85,10 +83,9 @@ XLA_TEST_F(LocalClientExecuteTest, AddZeroElementVectors) {
   builder.Add(x, y);
 
   auto x_array = LiteralToShapedBuffer(*Literal::CreateR1<float>({}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_array.get()});
-
-  LiteralTestUtil::ExpectR1Near<float>({}, *ShapedBufferToLiteral(*result),
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array});
+  LiteralTestUtil::ExpectR1Near<float>({}, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }
 
@@ -100,11 +97,10 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectors) {
 
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_array.get()});
-
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array});
   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) {
@@ -116,13 +112,12 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) {
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
   ExecutionProfile profile;
-  std::unique_ptr<ScopedShapedBuffer> result = ExecuteLocallyOrDie(
-      builder.Build().ValueOrDie(), {x_array.get()},
-      DefaultExecutableBuildOptions(),
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(
+      builder.Build().ValueOrDie(), {&x_array}, DefaultExecutableBuildOptions(),
       DefaultExecutableRunOptions().set_execution_profile(&profile));
 
   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
   EXPECT_GT(profile.compute_and_transfer_time_ns(), 0);
 }
 
@@ -136,27 +131,27 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) {
   // Create x as a col-major array.
   auto x_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1})));
-  EXPECT_TRUE(LayoutUtil::Equal(x_array->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(x_array.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
 
   // Create y as a row-major array.
   auto y_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
       {{10.0f, 20.0f}, {30.0f, 40.0f}}, LayoutUtil::MakeLayout({1, 0})));
-  EXPECT_TRUE(LayoutUtil::Equal(y_array->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(y_array.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
 
-  std::unique_ptr<ScopedShapedBuffer> result_colmaj =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result_colmaj =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_colmaj),
+                                       *ShapedBufferToLiteral(result_colmaj),
                                        error_spec_);
 
   // Run with the parameter values in a different order.
-  std::unique_ptr<ScopedShapedBuffer> result_param_swap =
-      ExecuteLocallyOrDie(computation, {y_array.get(), x_array.get()});
+  ScopedShapedBuffer result_param_swap =
+      ExecuteLocallyOrDie(computation, {&y_array, &x_array});
   LiteralTestUtil::ExpectR2Near<float>(
       {{11.0f, 22.0f}, {33.0f, 44.0f}},
-      *ShapedBufferToLiteral(*result_param_swap), error_spec_);
+      *ShapedBufferToLiteral(result_param_swap), error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
@@ -172,27 +167,27 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));
 
   // Run with col-major result layout.
-  std::unique_ptr<ScopedShapedBuffer> result_colmaj = ExecuteLocallyOrDie(
-      computation, {x_array.get(), y_array.get()},
+  ScopedShapedBuffer result_colmaj = ExecuteLocallyOrDie(
+      computation, {&x_array, &y_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {0, 1})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_colmaj),
+                                       *ShapedBufferToLiteral(result_colmaj),
                                        error_spec_);
 
   // Run with row-major result layout.
-  std::unique_ptr<ScopedShapedBuffer> result_rowmaj = ExecuteLocallyOrDie(
-      computation, {x_array.get(), y_array.get()},
+  ScopedShapedBuffer result_rowmaj = ExecuteLocallyOrDie(
+      computation, {&x_array, &y_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {1, 0})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_rowmaj),
+                                       *ShapedBufferToLiteral(result_rowmaj),
                                        error_spec_);
 }
 
@@ -208,13 +203,13 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
   auto y_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -237,13 +232,13 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
   auto y_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {1}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -274,11 +269,11 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) {
        ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2},
                                       /*minor_to_major=*/{1, 0})});
   options.set_result_layout(shape_with_layout);
-  std::unique_ptr<ScopedShapedBuffer> result = ExecuteLocallyOrDie(
-      builder.Build().ValueOrDie(), {array.get(), array.get()}, options,
-      DefaultExecutableRunOptions());
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&array, &array},
+                          options, DefaultExecutableRunOptions());
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -318,13 +313,13 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
   auto x_buffer = LiteralToShapedBuffer(*x_literal);
   auto y_buffer = LiteralToShapedBuffer(*y_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_buffer.get(), y_buffer.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_buffer, &y_buffer});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{56.0f, 46.0f}, {36.0f, 26.0f}},
       LiteralView::Create(*result_literal, {0}));
@@ -363,10 +358,9 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) {
        Literal::CreateR1<float>({222.0, -2.0, 10.0}).get()});
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{-1.0, -2.0}, {-3.0, -4}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR1Equal<float>(
@@ -394,18 +388,16 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) {
        Literal::CreateR2<float>({{11.0, 3.0}, {4.0, 5.0}}).get()});
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result_0 =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_0_literal = ShapedBufferToLiteral(*result_0);
+  ScopedShapedBuffer result_0 = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_0_literal = ShapedBufferToLiteral(result_0);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{-1.0, -2.0}, {-3.0, -4.0}},
       LiteralView::Create(*result_0_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
       {{22.0, 6.0}, {8.0, 10}}, LiteralView::Create(*result_0_literal, {1}));
 
-  std::unique_ptr<ScopedShapedBuffer> result_1 =
-      ExecuteLocallyOrDie(computation, {result_0.get()});
-  std::unique_ptr<Literal> result_1_literal = ShapedBufferToLiteral(*result_1);
+  ScopedShapedBuffer result_1 = ExecuteLocallyOrDie(computation, {&result_0});
+  std::unique_ptr<Literal> result_1_literal = ShapedBufferToLiteral(result_1);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0, 2.0}, {3.0, 4.0}}, LiteralView::Create(*result_1_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -451,10 +443,8 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
       Literal::MakeTupleOwned(std::move(arg_elements));
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
 
   for (int i = 0; i < kElementCount; ++i) {
     LiteralTestUtil::ExpectR1Near<float>(
@@ -509,9 +499,8 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) {
   auto arg_literal = Literal::MakeTupleOwned(std::move(outer_tuple_elements));
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
 
   for (int i = 0; i < kFanout; ++i) {
     for (int j = 0; j < kFanout; ++j) {
@@ -554,9 +543,8 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) {
   }
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
 
   ShapeIndex index;
   for (int i = 0; i < kTupleDepth; ++i) {
@@ -576,7 +564,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) {
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({1.0f, 2.0f, 3.0f}));
   auto execute_status =
-      ExecuteLocally(builder.Build().ValueOrDie(), {x_array.get()});
+      ExecuteLocally(builder.Build().ValueOrDie(), {&x_array});
 
   EXPECT_FALSE(execute_status.ok());
   EXPECT_THAT(execute_status.status().error_message(),
@@ -592,7 +580,7 @@ XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) {
   auto x_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{0.0f, 1.0f}, {2.0f, 3.0f}}));
   auto execute_status =
-      ExecuteLocally(builder.Build().ValueOrDie(), {x_array.get()});
+      ExecuteLocally(builder.Build().ValueOrDie(), {&x_array});
 
   EXPECT_FALSE(execute_status.ok());
   EXPECT_THAT(execute_status.status().error_message(),
@@ -609,7 +597,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) {
   auto x_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{0.0f, 1.0f}, {2.0f, 3.0f}}));
   auto execute_status = ExecuteLocally(
-      builder.Build().ValueOrDie(), {x_array.get()},
+      builder.Build().ValueOrDie(), {&x_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32,
                                          /*dimensions=*/{1, 2, 3, 4},
@@ -642,9 +630,9 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) {
           computation, {},
           DefaultExecutableBuildOptions().set_device_ordinal(d),
           DefaultExecutableRunOptions().set_device_ordinal(d));
-      EXPECT_EQ(d, result->device_ordinal());
+      EXPECT_EQ(d, result.device_ordinal());
       LiteralTestUtil::ExpectR0Equal<float>(42.0f,
-                                            *ShapedBufferToLiteral(*result));
+                                            *ShapedBufferToLiteral(result));
     }
   }
 }
@@ -687,9 +675,9 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnStream) {
                             DefaultExecutableRunOptions().set_stream(&stream));
     // As a check to verify that the computation ran of the device associated
     // with the stream. This is a weak check, but stronger verification is hard.
-    EXPECT_EQ(d, result->device_ordinal());
+    EXPECT_EQ(d, result.device_ordinal());
     LiteralTestUtil::ExpectR0Equal<float>(42.0f,
-                                          *ShapedBufferToLiteral(*result));
+                                          *ShapedBufferToLiteral(result));
   }
 }
 
@@ -765,9 +753,9 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) {
       {builder.ConstantR1<float>(vec2), builder.ConstantR1<float>(vec1)});
   builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
+  ScopedShapedBuffer result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {});
-  std::unique_ptr<Literal> tuple_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> tuple_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR1Equal<float>(
       {2.0f, 4.0f, 6.0f}, LiteralView::Create(*tuple_literal, {0}));
   LiteralTestUtil::ExpectR1Equal<float>(
@@ -791,12 +779,12 @@ XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) {
 
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      executable->Run({x_array.get()}, DefaultExecutableRunOptions())
+  ScopedShapedBuffer result =
+      executable->Run({&x_array}, DefaultExecutableRunOptions())
           .ConsumeValueOrDie();
 
   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) {
@@ -809,7 +797,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) {
             literal, local_client_->default_device_ordinal(), allocator_));
     TF_ASSERT_OK_AND_ASSIGN(
         auto transferred_literal,
-        local_client_->ShapedBufferToLiteral(*shaped_buffer));
+        local_client_->ShapedBufferToLiteral(shaped_buffer));
     EXPECT_EQ(literal, *transferred_literal);
   };
 
@@ -849,7 +837,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) {
             literal, local_client_->default_device_ordinal(), allocator_));
     TF_ASSERT_OK_AND_ASSIGN(
         auto transferred_literal,
-        local_client_->ShapedBufferToLiteral(*shaped_buffer));
+        local_client_->ShapedBufferToLiteral(shaped_buffer));
     EXPECT_EQ(literal, *transferred_literal);
   };
 
@@ -917,12 +905,12 @@ void BM_LocalClientOverhead(int num_iters) {
           .ConsumeValueOrDie();
   auto literal = Literal::CreateR2<float>({{0, 0, 0}, {0, 0, 0}});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *literal, *buffer));
+      executors[device_ordinal], *literal, buffer));
 
   const int kWarmups = 2;
 
   auto executable_status = client->Compile(
-      computation, {&buffer->on_host_shape()}, ExecutableBuildOptions());
+      computation, {&buffer.on_host_shape()}, ExecutableBuildOptions());
   ASSERT_IS_OK(executable_status);
   std::unique_ptr<LocalExecutable> executable =
       executable_status.ConsumeValueOrDie();
@@ -934,13 +922,13 @@ void BM_LocalClientOverhead(int num_iters) {
   run_options.set_allocator(&allocator).set_stream(&stream);
 
   for (int i = 0; i < kWarmups; ++i) {
-    auto result = executable->Run({buffer.get()}, run_options);
+    auto result = executable->Run({&buffer}, run_options);
     ASSERT_IS_OK(result);
   }
 
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result = executable->Run({buffer.get()}, run_options);
+    auto result = executable->Run({&buffer}, run_options);
     ASSERT_IS_OK(result);
   }
 }
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 29fd985acf..c60ba2422f 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -128,7 +128,7 @@ LocalClientTestBase::LocalClientTestBase(se::Platform* platform)
 
 LocalClientTestBase::~LocalClientTestBase() {}
 
-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::LiteralToShapedBuffer(
+ScopedShapedBuffer LocalClientTestBase::LiteralToShapedBuffer(
     const Literal& literal) {
   return local_client_
       ->LiteralToShapedBuffer(literal, local_client_->default_device_ordinal())
@@ -155,7 +155,7 @@ ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const {
   return run_options;
 }
 
-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
+ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
@@ -163,7 +163,7 @@ std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
       .ConsumeValueOrDie();
 }
 
-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
+ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableBuildOptions& build_options,
@@ -172,16 +172,14 @@ std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
       .ConsumeValueOrDie();
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-LocalClientTestBase::ExecuteLocally(
+StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
                         DefaultExecutableRunOptions());
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-LocalClientTestBase::ExecuteLocally(
+StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableBuildOptions& build_options,
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 7555d5e893..4ee56a05ec 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -83,8 +83,7 @@ class LocalClientTestBase : public ::testing::Test {
   // Copy the given literal onto the default device and return a
   // ScopedShapedBuffer. Convenience wrapper around
   // LocalClient::LiteralToShapedBuffer.
-  std::unique_ptr<ScopedShapedBuffer> LiteralToShapedBuffer(
-      const Literal& literal);
+  ScopedShapedBuffer LiteralToShapedBuffer(const Literal& literal);
 
   // Construct and return a literal containing the array represented by
   // shaped_buffer.
@@ -93,19 +92,19 @@ class LocalClientTestBase : public ::testing::Test {
 
   // Execute the given computation on the local client. With and without
   // options.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteLocally(
+  StatusOr<ScopedShapedBuffer> ExecuteLocally(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteLocally(
+  StatusOr<ScopedShapedBuffer> ExecuteLocally(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutableBuildOptions& build_options,
       const ExecutableRunOptions& run_options);
 
-  std::unique_ptr<ScopedShapedBuffer> ExecuteLocallyOrDie(
+  ScopedShapedBuffer ExecuteLocallyOrDie(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
-  std::unique_ptr<ScopedShapedBuffer> ExecuteLocallyOrDie(
+  ScopedShapedBuffer ExecuteLocallyOrDie(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutableBuildOptions& build_options,
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
index 268ba338f2..e2067bc1b8 100644
--- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -45,7 +45,7 @@ class TransferManagerTest : public LocalClientTestBase {
 
   ~TransferManagerTest() override = default;
 
-  std::unique_ptr<ScopedShapedBuffer> AllocateDeviceBuffer(const Shape& shape) {
+  ScopedShapedBuffer AllocateDeviceBuffer(const Shape& shape) {
     return transfer_manager_
         ->AllocateScopedShapedBuffer(
             shape, GetOrCreateAllocator(local_client_->platform()),
@@ -64,10 +64,10 @@ XLA_TEST_F(TransferManagerTest, TransferR0U32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR0Equal<uint32>(42, *result);
 }
@@ -80,10 +80,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1F32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR1Equal<float>({1.25f, 2.5f, -17.0f, -20.125f},
                                         *result);
@@ -98,10 +98,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1LargeF32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR1Equal<float>(test_vector, *result);
 }
@@ -114,10 +114,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1U8) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   EXPECT_EQ(result->GetR1U8AsString(), test_string);
 }
@@ -130,10 +130,10 @@ XLA_TEST_F(TransferManagerTest, TransferR2F32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, *result);
@@ -150,10 +150,10 @@ XLA_TEST_F(TransferManagerTest,
   // Round trip literal through device. Set the on-device layout to something
   // different than the literal layout.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   EXPECT_FALSE(
       LayoutUtil::Equal(result->shape().layout(), literal->shape().layout()));
@@ -170,10 +170,10 @@ XLA_TEST_F(TransferManagerTest, TransferTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -184,10 +184,10 @@ XLA_TEST_F(TransferManagerTest, TransferEmptyTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -204,10 +204,10 @@ XLA_TEST_F(TransferManagerTest, TransferNestedTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -219,10 +219,10 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValue) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -238,10 +238,10 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index efb00d56c5..837a01e873 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -129,18 +129,18 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   auto* transfer_manager = backend->transfer_manager();
 
   TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<ScopedShapedBuffer> lhs_arg,
+      ScopedShapedBuffer lhs_arg,
       transfer_manager->AllocateScopedShapedBuffer(
           lhs_arg_shape, allocator, backend->default_device_ordinal()));
   TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
-      executor, *Literal::CreateFromShape(lhs_arg_shape), *lhs_arg));
+      executor, *Literal::CreateFromShape(lhs_arg_shape), lhs_arg));
 
   TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<ScopedShapedBuffer> rhs_arg,
+      ScopedShapedBuffer rhs_arg,
       transfer_manager->AllocateScopedShapedBuffer(
           rhs_arg_shape, allocator, backend->default_device_ordinal()));
   TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
-      executor, *Literal::CreateFromShape(rhs_arg_shape), *rhs_arg));
+      executor, *Literal::CreateFromShape(rhs_arg_shape), rhs_arg));
 
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LocalExecutable> local_executable,
@@ -165,7 +165,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
       backend->eigen_intra_op_thread_pool());
   TF_ASSERT_OK_AND_ASSIGN(
       auto execution_result,
-      executable->ExecuteOnStream(&run_options, {lhs_arg.get(), rhs_arg.get()},
+      executable->ExecuteOnStream(&run_options, {&lhs_arg, &rhs_arg},
                                   &hlo_execution_profile));
   (void)execution_result;
 
-- 
GitLab


From d710d01a015fda65348ac0e5c25be3747624a779 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 19 Apr 2018 17:21:50 -0700
Subject: [PATCH 190/450] Minor code refactoring.

PiperOrigin-RevId: 193600173
---
 tensorflow/core/kernels/data/BUILD            |  3 ++-
 tensorflow/core/kernels/data/dataset_utils.cc | 13 +++++++++++++
 tensorflow/core/kernels/data/dataset_utils.h  |  2 ++
 tensorflow/core/kernels/data/iterator_ops.cc  | 13 ++-----------
 tensorflow/core/kernels/data/writer_ops.cc    | 15 ++-------------
 5 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 667a6967a8..c78e0aff83 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -515,6 +515,7 @@ tf_kernel_library(
     srcs = ["iterator_ops.cc"],
     deps = [
         ":dataset",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -586,7 +587,7 @@ tf_kernel_library(
     srcs = ["writer_ops.cc"],
     deps = [
         ":dataset",
-        "//tensorflow/core:core_cpu_internal",
+        ":dataset_utils",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index e3a3601ee8..67ddb52d57 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/common_runtime/device.h"
 
 namespace tensorflow {
 
@@ -45,6 +46,18 @@ Status MakeIteratorFromInputElement(
   return Status::OK();
 }
 
+IteratorContext MakeIteratorContext(OpKernelContext* ctx) {
+  IteratorContext::Params params;
+  params.env = ctx->env();
+  params.runner = *(ctx->runner());
+  params.lib = ctx->function_library();
+  DeviceBase* device = ctx->function_library()->device();
+  params.allocator_getter = [device](AllocatorAttributes attrs) {
+    return device->GetAllocator(attrs);
+  };
+  return IteratorContext(params);
+}
+
 }  // namespace dataset
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 6c4191c2be..e5ca71dd99 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -28,6 +28,8 @@ Status MakeIteratorFromInputElement(
     int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
     std::unique_ptr<IteratorBase>* out_iterator);
 
+IteratorContext MakeIteratorContext(OpKernelContext* ctx);
+
 }  // namespace dataset
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 4e4997d7b3..f5db97fd59 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -609,17 +610,7 @@ class ToSingleElementOp : public AsyncOpKernel {
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       auto iterator = dataset->MakeIterator("SingleElementIterator");
 
-      IteratorContext::Params params;
-      params.env = ctx->env();
-      params.runner = *(ctx->runner());
-      params.lib = ctx->function_library();
-      DeviceBase* device = ctx->function_library()->device();
-      params.allocator_getter = [device](AllocatorAttributes attrs) {
-        return device->GetAllocator(attrs);
-      };
-
-      IteratorContext iter_ctx(std::move(params));
-
+      IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
       std::vector<Tensor> components;
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence;
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
index 46821fd7b3..656fee1e85 100644
--- a/tensorflow/core/kernels/data/writer_ops.cc
+++ b/tensorflow/core/kernels/data/writer_ops.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/record_writer.h"
@@ -72,21 +72,10 @@ class ToTFRecordOp : public AsyncOpKernel {
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       auto iterator = dataset->MakeIterator("ToTFRecordOpIterator");
 
-      IteratorContext::Params params;  // TODO(b/78245447)
-      params.env = ctx->env();
-      params.runner = *(ctx->runner());
-      params.lib = ctx->function_library();
-      DeviceBase* device = ctx->function_library()->device();
-      params.allocator_getter = [device](AllocatorAttributes attrs) {
-        return device->GetAllocator(attrs);
-      };
-
-      IteratorContext iter_ctx(std::move(params));
-
+      IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
       std::vector<Tensor> components;
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence;
-
       do {
         OP_REQUIRES_OK_ASYNC(
             ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
-- 
GitLab


From c2905469335715929c630d2bd70068ccbc8eb2d1 Mon Sep 17 00:00:00 2001
From: manhyuk <manhyuk@kw.ac.kr>
Date: Fri, 20 Apr 2018 09:28:37 +0900
Subject: [PATCH 191/450] fix typo

---
 tensorflow/core/grappler/costs/virtual_scheduler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 5116c8183c..7edd10e3e8 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -212,7 +212,7 @@ class FirstReadyManager : public ReadyNodeManager {
 };
 
 // CompositeNodeManager has a few other NodeManagers: per-device LIFO for normal
-// ops (neither _Send nor _Recv) and FirstyReadyManagers for _Send ops and _Recv
+// ops (neither _Send nor _Recv) and FirstReadyManagers for _Send ops and _Recv
 // ops, and then it chooses FirstReady among the ops chosen from each
 // internal NodeManagers. The objective is to maximize producer-consumer
 // locality within device, while processing nodes across devices, including
-- 
GitLab


From 28a95990bf9ff228abec6a52389a4244a17a9101 Mon Sep 17 00:00:00 2001
From: manhyuk <manhyuk@kw.ac.kr>
Date: Fri, 20 Apr 2018 09:28:45 +0900
Subject: [PATCH 192/450] fix typo

---
 tensorflow/core/grappler/costs/virtual_scheduler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 7edd10e3e8..67bf1e6980 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -199,7 +199,7 @@ class FirstReadyManager : public ReadyNodeManager {
   // current node.
   std::vector<const NodeDef*> nodes_;
   // Newly added nodes are added to waiting_queue_. That way, GetCurrNode(),
-  // wihch returns the front of the nodes_, always returns the same node,
+  // which returns the front of the nodes_, always returns the same node,
   // even if any of new nodes has time_ready smaller than the current node's.
   std::vector<const NodeDef*> waiting_queue_;
   // Comparator functor for heap; stl heap is max heap, so we use "greater than"
-- 
GitLab


From c18a80967e55350affafbf2ff562056d4bddf234 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 17:26:41 -0700
Subject: [PATCH 193/450] Add support for non-Tensor args in recompute_grad

Previously, the function decorated by recompute_grad had to have a signature that contained only positional arguments, and all those arguments had to be Tensors. However, most "layers" that users define have non-Tensor arguments (for example, various hyperparameters) and often have keyword arguments as well. This change allows a user to use whatever function signature they wish while being explicit about which arguments are Tensors.

PiperOrigin-RevId: 193600682
---
 .../layers/python/layers/rev_block_lib.py     |  77 +++++++++++--
 .../python/layers/rev_block_lib_test.py       | 102 ++++++++++++++++++
 2 files changed, 168 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 02d294c68f..9f904cc302 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -45,6 +45,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_inspect
 
 __all__ = ["rev_block", "RevBlock", "recompute_grad"]
 
@@ -429,12 +430,13 @@ def enable_with_args(dec):
 
 
 @enable_with_args
-def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
+def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False,
+                   tensor_arg_names=None):
   """Decorator that recomputes the function on the backwards pass.
 
   Args:
-    fn: a function that takes Tensors (all as positional arguments) and returns
-      a tuple of Tensors.
+    fn: the subgraph-producing function to wrap and recompute when computing
+      gradients. Provide `tensor_arg_names` if not all arguments are `Tensor`s.
     use_data_dep: `bool`, if `True` will use a dummy data dependency to force
       the recompute to happen. If `False` will use a control dependency. By
       default will be `True` if in an XLA context and `False` otherwise. XLA
@@ -443,17 +445,25 @@ def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
       that all gradients are produced before any are consumed by downstream ops.
       If `use_data_dep` is also `True`, will use a data dependency instead of
       a control dependency.
+    tensor_arg_names: `list<str>`, names of the `Tensor` arguments to `fn`. If
+      `None`, assumes all arguments are `Tensor`s.
 
   Returns:
     A wrapped fn that is identical to fn when called, but its activations will
     be discarded and recomputed on the backwards pass (i.e. on a call to
     tf.gradients).
   """
+  if tensor_arg_names:
+    if not isinstance(tensor_arg_names, (list, tuple)):
+      raise TypeError("tensor_arg_names must be a list")
 
   @functools.wraps(fn)
-  def wrapped(*args):
+  def wrapped(*args, **kwargs):
+    tensor_only_fn, tensor_args = _make_tensor_only(fn, args, kwargs,
+                                                    tensor_arg_names)
     return _recompute_grad(
-        fn, args, use_data_dep=use_data_dep, tupleize_grads=tupleize_grads)
+        tensor_only_fn, tensor_args, use_data_dep=use_data_dep,
+        tupleize_grads=tupleize_grads)
 
   return wrapped
 
@@ -463,11 +473,59 @@ def _is_on_tpu():
   return control_flow_util.GetContainingXLAContext(ctxt) is not None
 
 
-def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
+def _make_tensor_only(fn, args, kwargs, tensor_arg_names):
+  """Return fn such that it only takes Tensor args for tensor_arg_names."""
+  argspec = tf_inspect.getargspec(fn)
+  if argspec.varargs is not None or argspec.keywords is not None:
+    raise ValueError("Function decorated with recompute_grad must not use "
+                     "*args or **kwargs.")
+  fn_arg_names = list(argspec.args)
+
+  # name_to_arg is a dict of argument name to argument value, including both
+  # positional and keyword arguments passed.
+  name_to_arg = {}
+  # Populate positional arguments.
+  for name, arg in zip(fn_arg_names[:len(args)], args):
+    name_to_arg[name] = arg
+  # Populate keyword arguments.
+  name_to_arg.update(kwargs)
+
+  # Separate the Tensor arguments from the non-Tensor arguments.
+  # The default is that all arguments are Tensor arguments.
+  tensor_arg_names = tensor_arg_names or fn_arg_names
+  for name in tensor_arg_names:
+    if name not in name_to_arg:
+      raise ValueError("Must provide Tensor argument %s" % name)
+  tensor_args = [name_to_arg[name] for name in tensor_arg_names]
+  non_tensor_kwargs = dict([(name, arg) for name, arg in name_to_arg.items()
+                            if name not in tensor_arg_names])
+
+  # Check that Tensor arguments are in fact Tensors and that non-Tensor
+  # arguments are not.
+  for name, arg in zip(tensor_arg_names, tensor_args):
+    if not isinstance(arg, framework_ops.Tensor):
+      raise TypeError("Fn argument %s must be a Tensor." % name)
+  for name, arg in non_tensor_kwargs.items():
+    if isinstance(arg, framework_ops.Tensor):
+      raise TypeError("Fn argument %s must not be a Tensor." % name)
+
+  # Construct a Tensor-only wrapper function that will pass the non-Tensor
+  # arguments as well when called.
+  def tensor_only_fn(*tensors):
+    all_kwargs = dict(zip(tensor_arg_names, tensors))
+    all_kwargs.update(non_tensor_kwargs)
+    return fn(**all_kwargs)
+
+  return tensor_only_fn, tensor_args
+
+
+def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT,
+                    tupleize_grads=False):
   """See recompute_grad."""
   for arg in args:
     if not isinstance(arg, framework_ops.Tensor):
       raise ValueError("All inputs to function must be Tensors")
+
   use_data_dep_ = use_data_dep
   if use_data_dep_ == _USE_DEFAULT:
     use_data_dep_ = _is_on_tpu()
@@ -501,14 +559,11 @@ def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
     grad_vars = grads[len(inputs):]
     return grad_inputs, grad_vars
 
+  # TODO(rsepassi): Replace with tf.custom_gradient
   @_fn_with_custom_grad(grad_fn)
   def fn_with_recompute(*args):
     cached_vs.append(variable_scope.get_variable_scope())
-    # TODO(rsepassi): Rm conditional in TF 1.4
-    if hasattr(contrib_framework_ops, "current_arg_scope"):
-      cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
-    else:
-      cached_arg_scope.append({})
+    cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
     return fn(*args)
 
   return fn_with_recompute(*args)
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index 392a490be1..66ccc696f9 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -318,6 +318,108 @@ class RecomputeTest(test.TestCase):
       self.assertEqual(1, len(grads))
       self.assertTrue(grads[0] is not None)
 
+  def testWithNontensorArgs(self):
+    @rev_block_lib.recompute_grad(tupleize_grads=True,
+                                  tensor_arg_names=["inputs"])
+    def layer_with_recompute(inputs, plus=None):
+      var = variable_scope.get_variable("var", ())
+      self.assertFalse(plus)  # called with False below
+      if plus:
+        return var + inputs
+      else:
+        return var * inputs
+
+    inputs = array_ops.ones((), dtypes.float32)
+    outputs = layer_with_recompute(inputs, plus=False)
+    loss = math_ops.square(outputs)
+    grads = gradients_impl.gradients(loss, variables.trainable_variables())
+    self.assertEqual(1, len(grads))
+    self.assertTrue(grads[0] is not None)
+
+
+class MakeTensorOnlyTest(test.TestCase):
+
+  def testMakeTensorOnly(self):
+    def fn(a, b, c, d=1, e=None, f=7):
+      return (a, b, c, d, e, f)
+
+    t1 = array_ops.ones(())
+    t2 = array_ops.ones(())
+    t3 = array_ops.ones(())
+    args = [1, t1, 3, t2]
+    kwargs = {"e": t3}
+    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
+        fn, args, kwargs, ["b", "d", "e"])
+    self.assertAllEqual(tensor_args, [t1, t2, t3])
+    out = tensor_only_fn(*tensor_args)
+    self.assertAllEqual(out, (1, t1, 3, t2, t3, 7))
+
+  def testMakeTensorOnlyPositionalArgsOnly(self):
+    def fn(a, b, c):
+      return (a, b, c)
+
+    t1 = array_ops.ones(())
+    t2 = array_ops.ones(())
+    args = [t1, 3, t2]
+    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
+        fn, args, {}, ["a", "c"])
+    self.assertAllEqual(tensor_args, [t1, t2])
+    out = tensor_only_fn(*tensor_args)
+    self.assertAllEqual(out, (t1, 3, t2))
+
+  def testMakeTensorOnlyKwargsArgsOnly(self):
+    def fn(a=1, b=2, c=3):
+      return (a, b, c)
+
+    t1 = array_ops.ones(())
+    t2 = array_ops.ones(())
+    args = [t1]
+    kwargs = {"c": t2}
+    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
+        fn, args, kwargs, ["a", "c"])
+    self.assertAllEqual(tensor_args, [t1, t2])
+    out = tensor_only_fn(*tensor_args)
+    self.assertAllEqual(out, (t1, 2, t2))
+
+  def testErrorOnMissingTensorArg(self):
+    def fn(a, b):
+      return (a, b)
+
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "provide Tensor argument"):
+      rev_block_lib._make_tensor_only(fn, [], {"b": 2}, ["a"])
+
+  def testErrorOnSignatureSplats(self):
+    def fn1(a, *args):
+      return (a, args)
+
+    err_msg = r"must not use \*args or \*\*kwargs"
+    with self.assertRaisesWithPredicateMatch(ValueError, err_msg):
+      rev_block_lib._make_tensor_only(fn1, [1, 2], {}, ["a"])
+
+    def fn2(a, **kwargs):
+      return (a, kwargs)
+
+    with self.assertRaisesWithPredicateMatch(ValueError, err_msg):
+      rev_block_lib._make_tensor_only(fn2, [], {"a": 1, "b": 2}, ["a"])
+
+  def testErrorOnNonTensorForTensor(self):
+    def fn(a, b):
+      return (a, b)
+
+    with self.assertRaisesWithPredicateMatch(TypeError, "must be a Tensor"):
+      rev_block_lib._make_tensor_only(fn, [2, 3], {}, ["a"])
+
+  def testErrorOnTensorForNonTensor(self):
+    def fn(a, b):
+      return (a, b)
+
+    with self.assertRaisesWithPredicateMatch(
+        TypeError, "must not be a Tensor"):
+      t1 = array_ops.ones(())
+      t2 = array_ops.ones(())
+      rev_block_lib._make_tensor_only(fn, [t1, t2], {}, ["a"])
+
 
 class FnWithCustomGradTest(test.TestCase):
 
-- 
GitLab


From 13a7e9820a800cf3877e5a44b9f654f79808a2d4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 17:27:04 -0700
Subject: [PATCH 194/450] Update DecodeProtoOp so that it returns explicitly
 specified default values for missing fields.

PiperOrigin-RevId: 193600735
---
 .../kernel_tests/defaut_values.TestCase.pbtxt |  94 +++++++++
 .../promote_unsigned.TestCase.pbtxt           |  10 +-
 .../python/kernel_tests/test_example.proto    |  33 +++
 tensorflow/core/kernels/decode_proto_op.cc    | 188 +++++++++++++++---
 4 files changed, 300 insertions(+), 25 deletions(-)
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt

diff --git a/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt
new file mode 100644
index 0000000000..4e31681907
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt
@@ -0,0 +1,94 @@
+primitive {
+  # No fields specified, so we get all defaults
+}
+shape: 1
+sizes: 0
+field {
+  name: "double_default"
+  dtype: DT_DOUBLE
+  expected { double_value: 1.0 }
+}
+sizes: 0
+field {
+  name: "float_default"
+  dtype: DT_DOUBLE  # Try casting the float field to double.
+  expected { double_value: 2.0 }
+}
+sizes: 0
+field {
+  name: "int64_default"
+  dtype: DT_INT64
+  expected { int64_value: 3 }
+}
+sizes: 0
+field {
+  name: "uint64_default"
+  dtype: DT_INT64
+  expected { int64_value: 4 }
+}
+sizes: 0
+field {
+  name: "int32_default"
+  dtype: DT_INT32
+  expected { int32_value: 5 }
+}
+sizes: 0
+field {
+  name: "fixed64_default"
+  dtype: DT_INT64
+  expected { int64_value: 6 }
+}
+sizes: 0
+field {
+  name: "fixed32_default"
+  dtype: DT_INT32
+  expected { int32_value: 7 }
+}
+sizes: 0
+field {
+  name: "bool_default"
+  dtype: DT_BOOL
+  expected { bool_value: true }
+}
+sizes: 0
+field {
+  name: "string_default"
+  dtype: DT_STRING
+  expected { string_value: "a" }
+}
+sizes: 0
+field {
+  name: "bytes_default"
+  dtype: DT_STRING
+  expected { string_value: "a longer default string" }
+}
+sizes: 0
+field {
+  name: "uint32_default"
+  dtype: DT_INT32
+  expected { int32_value: -1 }
+}
+sizes: 0
+field {
+  name: "sfixed32_default"
+  dtype: DT_INT32
+  expected { int32_value: 10 }
+}
+sizes: 0
+field {
+  name: "sfixed64_default"
+  dtype: DT_INT64
+  expected { int64_value: 11 }
+}
+sizes: 0
+field {
+  name: "sint32_default"
+  dtype: DT_INT32
+  expected { int32_value: 12 }
+}
+sizes: 0
+field {
+  name: "sint64_default"
+  dtype: DT_INT64
+  expected { int64_value: 13 }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
index db7555bf2d..bc07efc8f3 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
+++ b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
@@ -4,7 +4,6 @@ primitive {
 }
 shape: 1
 sizes: 1
-sizes: 1
 field {
   name: "fixed32_value"
   dtype: DT_INT64
@@ -12,6 +11,7 @@ field {
     int64_value: 4294967295
   }
 }
+sizes: 1
 field {
   name: "uint32_value"
   dtype: DT_INT64
@@ -19,3 +19,11 @@ field {
     int64_value: 4294967295
   }
 }
+sizes: 0
+field {
+  name: "uint32_default"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295  # Comes from an explicitly-specified default
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
index dc495034ff..a2c88e372b 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
@@ -72,6 +72,23 @@ message RepeatedPrimitiveValue {
   repeated sint32 sint32_value = 17;
   repeated sint64 sint64_value = 18;
   repeated PrimitiveValue message_value = 19;
+
+  // Optional fields with explicitly-specified defaults.
+  optional double double_default = 20 [default = 1.0];
+  optional float float_default = 21 [default = 2.0];
+  optional int64 int64_default = 22 [default = 3];
+  optional uint64 uint64_default = 23 [default = 4];
+  optional int32 int32_default = 24 [default = 5];
+  optional fixed64 fixed64_default = 25 [default = 6];
+  optional fixed32 fixed32_default = 26 [default = 7];
+  optional bool bool_default = 27 [default = true];
+  optional string string_default = 28 [default = "a"];
+  optional bytes bytes_default = 29 [default = "a longer default string"];
+  optional uint32 uint32_default = 30 [default = 4294967295];
+  optional sfixed32 sfixed32_default = 31 [default = 10];
+  optional sfixed64 sfixed64_default = 32 [default = 11];
+  optional sint32 sint32_default = 33 [default = 12];
+  optional sint64 sint64_default = 34 [default = 13];
 }
 
 // A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
@@ -97,6 +114,22 @@ message PackedPrimitiveValue {
   repeated sint32 sint32_value = 17 [packed = true];
   repeated sint64 sint64_value = 18 [packed = true];
   repeated PrimitiveValue message_value = 19;
+
+  optional double double_default = 20 [default = 1.0];
+  optional float float_default = 21 [default = 2.0];
+  optional int64 int64_default = 22 [default = 3];
+  optional uint64 uint64_default = 23 [default = 4];
+  optional int32 int32_default = 24 [default = 5];
+  optional fixed64 fixed64_default = 25 [default = 6];
+  optional fixed32 fixed32_default = 26 [default = 7];
+  optional bool bool_default = 27 [default = true];
+  optional string string_default = 28 [default = "a"];
+  optional bytes bytes_default = 29 [default = "a longer default string"];
+  optional uint32 uint32_default = 30 [default = 4294967295];
+  optional sfixed32 sfixed32_default = 31 [default = 10];
+  optional sfixed64 sfixed64_default = 32 [default = 11];
+  optional sint32 sint32_default = 33 [default = 12];
+  optional sint64 sint64_default = 34 [default = 13];
 }
 
 message EnumValue {
diff --git a/tensorflow/core/kernels/decode_proto_op.cc b/tensorflow/core/kernels/decode_proto_op.cc
index b4e5b776ed..24f8a4f72f 100644
--- a/tensorflow/core/kernels/decode_proto_op.cc
+++ b/tensorflow/core/kernels/decode_proto_op.cc
@@ -105,11 +105,137 @@ bool CheckOutputType(FieldDescriptor::Type field_type, DataType output_type) {
   }
 }
 
+// Used to store the default value of a protocol message field, cast to the
+// type of the output tensor.
+//
+// TODO(paskin): Use absl::variant once TensorFlow gets absl dependencies.
+struct DefaultValue {
+  DataType dtype = DataType::DT_INVALID;
+  union Value {
+    bool v_bool;           // DT_BOOL
+    uint8 v_uint8;         // DT_UINT8
+    int8 v_int8;           // DT_INT8
+    int32 v_int32;         // DT_INT32
+    int64 v_int64;         // DT_INT64
+    float v_float;         // DT_FLOAT
+    double v_double;       // DT_DOUBLE
+    const char* v_string;  // DT_STRING
+  };
+  Value value;
+};
+
+// Initializes a DefaultValue object.  This generic template handles numeric
+// types; strings are handled by a template specialization below.
+//
+// Args:
+//   dtype: the type of the output tensor
+//   value: the default value as obtained from the FieldDescriptor
+//   result: the object to initialize
+template <typename T>
+Status InitDefaultValue(DataType dtype, const T value, DefaultValue* result) {
+  result->dtype = dtype;
+  switch (dtype) {
+    case DT_BOOL:
+      result->value.v_bool = static_cast<bool>(value);
+      break;
+    case DT_INT32:
+      result->value.v_int32 = static_cast<int32>(value);
+      break;
+    case DT_INT8:
+      result->value.v_int8 = static_cast<int8>(value);
+      break;
+    case DT_UINT8:
+      result->value.v_uint8 = static_cast<uint8>(value);
+      break;
+    case DT_INT64:
+      result->value.v_int64 = static_cast<int64>(value);
+      break;
+    case DT_FLOAT:
+      result->value.v_float = static_cast<float>(value);
+      break;
+    case DT_DOUBLE:
+      result->value.v_double = static_cast<double>(value);
+      break;
+    default:
+      // We should never get here, given the type checking that occurs earlier.
+      return errors::Internal(
+          "Cannot initialize default value for unsupported type: ",
+          DataTypeString(dtype));
+  }
+  return Status::OK();
+}
+
+template <>
+Status InitDefaultValue(DataType dtype, const char* value,
+                        DefaultValue* result) {
+  // These are sanity checks that should never trigger given the code that
+  // leads here.
+  if (TF_PREDICT_FALSE(dtype != DT_STRING)) {
+    return errors::InvalidArgument(
+        "Cannot cast field to anything but DT_STRING");
+  }
+  if (TF_PREDICT_FALSE(value == nullptr)) {
+    return errors::InvalidArgument("Null default string value.");
+  }
+  result->dtype = DT_STRING;
+  result->value.v_string = value;
+  return Status::OK();
+}
+
+// Initializes a default value from the output data type and the field
+// descriptor.
+Status InitDefaultValueFromFieldDescriptor(DataType dtype,
+                                           const FieldDescriptor* field_desc,
+                                           DefaultValue* result) {
+  switch (field_desc->type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return InitDefaultValue(dtype, field_desc->default_value_double(),
+                              result);
+    case WireFormatLite::TYPE_FLOAT:
+      return InitDefaultValue(dtype, field_desc->default_value_float(), result);
+    case WireFormatLite::TYPE_INT64:
+    case WireFormatLite::TYPE_SINT64:
+    case WireFormatLite::TYPE_SFIXED64:
+      return InitDefaultValue(dtype, field_desc->default_value_int64(), result);
+    case WireFormatLite::TYPE_FIXED64:
+    case WireFormatLite::TYPE_UINT64:
+      return InitDefaultValue(dtype, field_desc->default_value_uint64(),
+                              result);
+    case WireFormatLite::TYPE_ENUM:
+    case WireFormatLite::TYPE_INT32:
+    case WireFormatLite::TYPE_SINT32:
+    case WireFormatLite::TYPE_SFIXED32:
+      return InitDefaultValue(dtype, field_desc->default_value_int32(), result);
+    case WireFormatLite::TYPE_FIXED32:
+    case WireFormatLite::TYPE_UINT32:
+      return InitDefaultValue(dtype, field_desc->default_value_uint32(),
+                              result);
+    case WireFormatLite::TYPE_BOOL:
+      return InitDefaultValue(dtype, field_desc->default_value_bool(), result);
+    case WireFormatLite::TYPE_BYTES:
+    case WireFormatLite::TYPE_STRING:
+      // Manipulating default string values as C-style pointers should be OK
+      // for typical code-generated protocol messages.  It is possible in
+      // principle to register a message descriptor on the fly, and these
+      // pointers may not be stable if that descriptor has a weird
+      // implementation.  (But the return type of default_value_string() is
+      // const string&, so it'd have to be very weird.)
+      return InitDefaultValue(dtype, field_desc->default_value_string().c_str(),
+                              result);
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+      return InitDefaultValue(dtype, "", result);
+      // default: intentionally omitted in order to enable static checking.
+  }
+  return Status::OK();
+}
+
 // A FieldInfo holds a handful of information from the FieldDescriptor
 // and user attributes.
 struct FieldInfo {
-  FieldInfo(const FieldDescriptor* field_desc, int user_index)
-      : output_index(user_index) {
+  FieldInfo(const FieldDescriptor* field_desc, int user_index,
+            DefaultValue def_value)
+      : output_index(user_index), default_value(def_value) {
     // Without this intermediate data structure, the profile had hotspots
     // calling methods of FieldDescriptor.
     number = field_desc->number();
@@ -144,6 +270,7 @@ struct FieldInfo {
   WireFormatLite::FieldType type;
   int number;
   bool is_repeated;
+  DefaultValue default_value;
 };
 
 // A CountCollector counts sizes of repeated and optional fields in a proto.
@@ -394,8 +521,11 @@ class DenseCollector {
   DenseCollector() = default;
 
   // A DenseCollector applies to one field of a serialized message.
-  DenseCollector(uint8* datap, DataType dtype, int max_repeat_count)
-      : datap_(datap), dtype_(dtype), max_repeat_count_(max_repeat_count) {}
+  // Note that default_value.dtype is the type of the output tensor.
+  DenseCollector(uint8* datap, DefaultValue default_value, int max_repeat_count)
+      : datap_(datap),
+        default_value_(default_value),
+        max_repeat_count_(max_repeat_count) {}
 
   // Reads a value from the input stream and stores it.
   //
@@ -415,8 +545,8 @@ class DenseCollector {
     }
     next_repeat_index_ = index + 1;
 
-    return internal::ReadValue(input, field.type, field.number, dtype_, index,
-                               datap_);
+    return internal::ReadValue(input, field.type, field.number,
+                               default_value_.dtype, index, datap_);
   }
 
   // Reads and stores a length-delimited list of values.
@@ -445,8 +575,8 @@ class DenseCollector {
           field.number, ", Max entries allowed: ", max_repeat_count_);
     } else {
       return internal::ReadPackedFromArray(buf, buf_size, field.type,
-                                           field.number, dtype_, stride,
-                                           &next_repeat_index_, datap_);
+                                           field.number, default_value_.dtype,
+                                           stride, &next_repeat_index_, datap_);
     }
   }
 
@@ -454,23 +584,23 @@ class DenseCollector {
   // Dispatches to the appropriately typed field default based on the
   // runtime type tag.
   Status FillWithDefaults() {
-    switch (dtype_) {
+    switch (default_value_.dtype) {
       case DataType::DT_FLOAT:
-        return FillDefault<float>();
+        return FillDefault<float>(default_value_.value.v_float);
       case DataType::DT_DOUBLE:
-        return FillDefault<double>();
+        return FillDefault<double>(default_value_.value.v_double);
       case DataType::DT_INT32:
-        return FillDefault<int32>();
+        return FillDefault<int32>(default_value_.value.v_int32);
       case DataType::DT_UINT8:
-        return FillDefault<uint8>();
+        return FillDefault<uint8>(default_value_.value.v_uint8);
       case DataType::DT_INT8:
-        return FillDefault<int8>();
+        return FillDefault<int8>(default_value_.value.v_int8);
       case DataType::DT_STRING:
-        return FillDefault<string>();
+        return FillDefault<string>(default_value_.value.v_string);
       case DataType::DT_INT64:
-        return FillDefault<int64>();
+        return FillDefault<int64>(default_value_.value.v_int64);
       case DataType::DT_BOOL:
-        return FillDefault<bool>();
+        return FillDefault<bool>(default_value_.value.v_bool);
       default:
         // There are many tensorflow dtypes not handled here, but they
         // should not come up unless type casting is added to the Op.
@@ -485,9 +615,9 @@ class DenseCollector {
   // default value. This uses next_repeat_index_ which counts the number
   // of parsed values for the field.
   template <class T>
-  Status FillDefault() {
+  Status FillDefault(const T& default_value) {
     for (int i = next_repeat_index_; i < max_repeat_count_; i++) {
-      reinterpret_cast<T*>(datap_)[i] = T();
+      reinterpret_cast<T*>(datap_)[i] = default_value;
     }
     return Status::OK();
   }
@@ -501,7 +631,7 @@ class DenseCollector {
   // for more items than we have allocated space.
   void* const datap_ = nullptr;
 
-  const DataType dtype_ = DataType::DT_INVALID;
+  const DefaultValue default_value_;
   const int max_repeat_count_ = 0;
 };
 
@@ -577,8 +707,14 @@ class DecodeProtoOp : public OpKernel {
 
     // Now store the fields in sorted order.
     for (int i = 0; i < field_names.size(); i++) {
-      fields_.push_back(MakeUnique<FieldInfo>(field_descs[output_indices[i]],
-                                              output_indices[i]));
+      const int output_index = output_indices[i];
+      const DataType dtype = output_types[output_index];
+      const FieldDescriptor* field_descriptor = field_descs[output_index];
+      DefaultValue default_value;
+      OP_REQUIRES_OK(context, InitDefaultValueFromFieldDescriptor(
+                                  dtype, field_descriptor, &default_value));
+      fields_.push_back(
+          MakeUnique<FieldInfo>(field_descriptor, output_index, default_value));
     }
 
     message_prototype_ = message_factory_.GetPrototype(message_desc);
@@ -805,9 +941,13 @@ class DecodeProtoOp : public OpKernel {
 
       std::vector<DenseCollector> collectors;
       collectors.reserve(field_count);
-      for (const TensorInfo& info : tensors) {
+      for (int output_index = 0; output_index < field_count; ++output_index) {
+        const TensorInfo& info = tensors[output_index];
+        const FieldInfo* field_info = fields_[output_index].get();
+        DCHECK(field_info != nullptr);
+        const DefaultValue default_value = field_info->default_value;
         collectors.emplace_back(info.data + message_index * info.stride,
-                                info.dtype, info.last_dim_size);
+                                default_value, info.last_dim_size);
       }
 
       // Fill in output tensors from the wire.
-- 
GitLab


From 976229dcbfde389864069433ebfc4085015df9c1 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Thu, 19 Apr 2018 17:30:49 -0700
Subject: [PATCH 195/450] Internal testing changes

PiperOrigin-RevId: 193601134
---
 tensorflow/contrib/lite/kernels/BUILD | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 8cfa7e53d1..80cefe83b2 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -212,6 +212,7 @@ tf_cc_test(
     name = "audio_spectrogram_test",
     size = "small",
     srcs = ["audio_spectrogram_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -225,6 +226,7 @@ tf_cc_test(
     name = "mfcc_test",
     size = "small",
     srcs = ["mfcc_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -346,6 +348,7 @@ tf_cc_test(
     name = "cast_test",
     size = "small",
     srcs = ["cast_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -398,6 +401,7 @@ tf_cc_test(
     name = "dequantize_test",
     size = "small",
     srcs = ["dequantize_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -504,6 +508,7 @@ tf_cc_test(
     name = "maximum_minimum_test",
     size = "small",
     srcs = ["maximum_minimum_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
-- 
GitLab


From 7f87125dceb3c69c5fd1d0712c6c93cc4ceaa854 Mon Sep 17 00:00:00 2001
From: Lukasz Kaiser <lukaszkaiser@google.com>
Date: Thu, 19 Apr 2018 17:39:09 -0700
Subject: [PATCH 196/450] internal END_PUBLIC

BEGIN_PUBLIC
Automated g4 rollback of changelist 193571934

PiperOrigin-RevId: 193602050
---
 tensorflow/core/lib/io/record_reader.cc    | 147 ++++++++++----
 tensorflow/core/lib/io/record_reader.h     |  16 +-
 tensorflow/core/lib/io/recordio_test.cc    | 212 +++++++--------------
 tensorflow/core/lib/io/zlib_inputstream.cc |   9 +-
 tensorflow/core/lib/io/zlib_inputstream.h  |  10 +-
 5 files changed, 188 insertions(+), 206 deletions(-)

diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index c24628be57..6de850bb20 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -56,55 +56,110 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions(
 
 RecordReader::RecordReader(RandomAccessFile* file,
                            const RecordReaderOptions& options)
-    : options_(options),
-      input_stream_(new RandomAccessInputStream(file)),
-      last_read_failed_(false) {
+    : src_(file), options_(options) {
   if (options.buffer_size > 0) {
-    input_stream_.reset(new BufferedInputStream(input_stream_.release(),
-                                                options.buffer_size, true));
+    input_stream_.reset(new BufferedInputStream(file, options.buffer_size));
+  } else {
+    input_stream_.reset(new RandomAccessInputStream(file));
   }
   if (options.compression_type == RecordReaderOptions::ZLIB_COMPRESSION) {
 // We don't have zlib available on all embedded platforms, so fail.
 #if defined(IS_SLIM_BUILD)
     LOG(FATAL) << "Zlib compression is unsupported on mobile platforms.";
 #else   // IS_SLIM_BUILD
-    input_stream_.reset(new ZlibInputStream(
-        input_stream_.release(), options.zlib_options.input_buffer_size,
-        options.zlib_options.output_buffer_size, options.zlib_options, true));
+    zlib_input_stream_.reset(new ZlibInputStream(
+        input_stream_.get(), options.zlib_options.input_buffer_size,
+        options.zlib_options.output_buffer_size, options.zlib_options));
 #endif  // IS_SLIM_BUILD
   } else if (options.compression_type == RecordReaderOptions::NONE) {
     // Nothing to do.
   } else {
-    LOG(FATAL) << "Unrecognized compression type :" << options.compression_type;
+    LOG(FATAL) << "Unspecified compression type :" << options.compression_type;
   }
 }
 
 // Read n+4 bytes from file, verify that checksum of first n bytes is
 // stored in the last 4 bytes and store the first n bytes in *result.
-//
-// offset corresponds to the user-provided value to ReadRecord()
-// and is used only in error messages.
-Status RecordReader::ReadChecksummed(uint64 offset, size_t n, string* result) {
+// May use *storage as backing store.
+Status RecordReader::ReadChecksummed(uint64 offset, size_t n,
+                                     StringPiece* result, string* storage) {
   if (n >= SIZE_MAX - sizeof(uint32)) {
     return errors::DataLoss("record size too large");
   }
 
   const size_t expected = n + sizeof(uint32);
-  TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, result));
+  storage->resize(expected);
+
+#if !defined(IS_SLIM_BUILD)
+  if (zlib_input_stream_) {
+    // If we have a zlib compressed buffer, we assume that the
+    // file is being read sequentially, and we use the underlying
+    // implementation to read the data.
+    //
+    // No checks are done to validate that the file is being read
+    // sequentially.  At some point the zlib input buffer may support
+    // seeking, possibly inefficiently.
+    TF_RETURN_IF_ERROR(zlib_input_stream_->ReadNBytes(expected, storage));
+
+    if (storage->size() != expected) {
+      if (storage->empty()) {
+        return errors::OutOfRange("eof");
+      } else {
+        return errors::DataLoss("truncated record at ", offset);
+      }
+    }
 
-  if (result->size() != expected) {
-    if (result->empty()) {
-      return errors::OutOfRange("eof");
+    uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
+    if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
+      return errors::DataLoss("corrupted record at ", offset);
+    }
+    *result = StringPiece(storage->data(), n);
+  } else {
+#endif  // IS_SLIM_BUILD
+    if (options_.buffer_size > 0) {
+      // If we have a buffer, we assume that the file is being read
+      // sequentially, and we use the underlying implementation to read the
+      // data.
+      //
+      // No checks are done to validate that the file is being read
+      // sequentially.
+      TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, storage));
+
+      if (storage->size() != expected) {
+        if (storage->empty()) {
+          return errors::OutOfRange("eof");
+        } else {
+          return errors::DataLoss("truncated record at ", offset);
+        }
+      }
+
+      const uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
+      if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
+        return errors::DataLoss("corrupted record at ", offset);
+      }
+      *result = StringPiece(storage->data(), n);
     } else {
-      return errors::DataLoss("truncated record at ", offset);
+      // This version supports reading from arbitrary offsets
+      // since we are accessing the random access file directly.
+      StringPiece data;
+      TF_RETURN_IF_ERROR(src_->Read(offset, expected, &data, &(*storage)[0]));
+      if (data.size() != expected) {
+        if (data.empty()) {
+          return errors::OutOfRange("eof");
+        } else {
+          return errors::DataLoss("truncated record at ", offset);
+        }
+      }
+      const uint32 masked_crc = core::DecodeFixed32(data.data() + n);
+      if (crc32c::Unmask(masked_crc) != crc32c::Value(data.data(), n)) {
+        return errors::DataLoss("corrupted record at ", offset);
+      }
+      *result = StringPiece(data.data(), n);
     }
+#if !defined(IS_SLIM_BUILD)
   }
+#endif  // IS_SLIM_BUILD
 
-  const uint32 masked_crc = core::DecodeFixed32(result->data() + n);
-  if (crc32c::Unmask(masked_crc) != crc32c::Value(result->data(), n)) {
-    return errors::DataLoss("corrupted record at ", offset);
-  }
-  result->resize(n);
   return Status::OK();
 }
 
@@ -112,42 +167,50 @@ Status RecordReader::ReadRecord(uint64* offset, string* record) {
   static const size_t kHeaderSize = sizeof(uint64) + sizeof(uint32);
   static const size_t kFooterSize = sizeof(uint32);
 
-  // Position the input stream.
-  int64 curr_pos = input_stream_->Tell();
-  int64 desired_pos = static_cast<int64>(*offset);
-  if (curr_pos > desired_pos || curr_pos < 0 /* EOF */ ||
-      (curr_pos == desired_pos && last_read_failed_)) {
-    last_read_failed_ = false;
-    TF_RETURN_IF_ERROR(input_stream_->Reset());
-    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos));
-  } else if (curr_pos < desired_pos) {
-    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos - curr_pos));
-  }
-  DCHECK_EQ(desired_pos, input_stream_->Tell());
-
   // Read header data.
-  Status s = ReadChecksummed(*offset, sizeof(uint64), record);
+  StringPiece lbuf;
+  Status s = ReadChecksummed(*offset, sizeof(uint64), &lbuf, record);
   if (!s.ok()) {
-    last_read_failed_ = true;
     return s;
   }
-  const uint64 length = core::DecodeFixed64(record->data());
+  const uint64 length = core::DecodeFixed64(lbuf.data());
 
   // Read data
-  s = ReadChecksummed(*offset + kHeaderSize, length, record);
+  StringPiece data;
+  s = ReadChecksummed(*offset + kHeaderSize, length, &data, record);
   if (!s.ok()) {
-    last_read_failed_ = true;
     if (errors::IsOutOfRange(s)) {
       s = errors::DataLoss("truncated record at ", *offset);
     }
     return s;
   }
 
+  if (record->data() != data.data()) {
+    // RandomAccessFile placed the data in some other location.
+    memmove(&(*record)[0], data.data(), data.size());
+  }
+
+  record->resize(data.size());
+
   *offset += kHeaderSize + length + kFooterSize;
-  DCHECK_EQ(*offset, input_stream_->Tell());
   return Status::OK();
 }
 
+Status RecordReader::SkipNBytes(uint64 offset) {
+#if !defined(IS_SLIM_BUILD)
+  if (zlib_input_stream_) {
+    TF_RETURN_IF_ERROR(zlib_input_stream_->SkipNBytes(offset));
+  } else {
+#endif
+    if (options_.buffer_size > 0) {
+      TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(offset));
+    }
+#if !defined(IS_SLIM_BUILD)
+  }
+#endif
+  return Status::OK();
+}
+
 SequentialRecordReader::SequentialRecordReader(
     RandomAccessFile* file, const RecordReaderOptions& options)
     : underlying_(file, options), offset_(0) {}
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index f6d587dfa0..26278e0328 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -69,14 +69,25 @@ class RecordReader {
   // Read the record at "*offset" into *record and update *offset to
   // point to the offset of the next record.  Returns OK on success,
   // OUT_OF_RANGE for end of file, or something else for an error.
+  //
+  // Note: if buffering is used (with or without compression), access must be
+  // sequential.
   Status ReadRecord(uint64* offset, string* record);
 
+  // Skip "offset" bytes forward (a byte count, not an absolute position).
+  // No-op for unbuffered, uncompressed readers. Returns OK or a stream error.
+  Status SkipNBytes(uint64 offset);
+
  private:
-  Status ReadChecksummed(uint64 offset, size_t n, string* result);
+  Status ReadChecksummed(uint64 offset, size_t n, StringPiece* result,
+                         string* storage);
 
+  RandomAccessFile* src_;
   RecordReaderOptions options_;
   std::unique_ptr<InputStreamInterface> input_stream_;
-  bool last_read_failed_;
+#if !defined(IS_SLIM_BUILD)
+  std::unique_ptr<ZlibInputStream> zlib_input_stream_;
+#endif  // IS_SLIM_BUILD
 
   TF_DISALLOW_COPY_AND_ASSIGN(RecordReader);
 };
@@ -110,6 +121,7 @@ class SequentialRecordReader {
       return errors::InvalidArgument(
           "Trying to seek offset: ", offset,
           " which is less than the current offset: ", offset_);
+    TF_RETURN_IF_ERROR(underlying_.SkipNBytes(offset - offset_));
     offset_ = offset;
     return Status::OK();
   }
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index da514bd21c..63235761d9 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -26,11 +26,10 @@ limitations under the License.
 
 namespace tensorflow {
 namespace io {
-namespace {
 
 // Construct a string of the specified length made out of the supplied
 // partial string.
-string BigString(const string& partial_string, size_t n) {
+static string BigString(const string& partial_string, size_t n) {
   string result;
   while (result.size() < n) {
     result.append(partial_string);
@@ -40,66 +39,62 @@ string BigString(const string& partial_string, size_t n) {
 }
 
 // Construct a string from a number
-string NumberString(int n) {
+static string NumberString(int n) {
   char buf[50];
   snprintf(buf, sizeof(buf), "%d.", n);
   return string(buf);
 }
 
 // Return a skewed potentially long string
-string RandomSkewedString(int i, random::SimplePhilox* rnd) {
+static string RandomSkewedString(int i, random::SimplePhilox* rnd) {
   return BigString(NumberString(i), rnd->Skewed(17));
 }
 
-class StringDest : public WritableFile {
- public:
-  explicit StringDest(string* contents) : contents_(contents) {}
-
-  Status Close() override { return Status::OK(); }
-  Status Flush() override { return Status::OK(); }
-  Status Sync() override { return Status::OK(); }
-  Status Append(const StringPiece& slice) override {
-    contents_->append(slice.data(), slice.size());
-    return Status::OK();
-  }
-
+class RecordioTest : public ::testing::Test {
  private:
-  string* contents_;
-};
-
-class StringSource : public RandomAccessFile {
- public:
-  explicit StringSource(string* contents)
-      : contents_(contents), force_error_(false) {}
-
-  Status Read(uint64 offset, size_t n, StringPiece* result,
-              char* scratch) const override {
-    if (force_error_) {
-      force_error_ = false;
-      return errors::DataLoss("read error");
+  class StringDest : public WritableFile {
+   public:
+    string contents_;
+
+    Status Close() override { return Status::OK(); }
+    Status Flush() override { return Status::OK(); }
+    Status Sync() override { return Status::OK(); }
+    Status Append(const StringPiece& slice) override {
+      contents_.append(slice.data(), slice.size());
+      return Status::OK();
     }
-
-    if (offset >= contents_->size()) {
-      return errors::OutOfRange("end of file");
-    }
-
-    if (contents_->size() < offset + n) {
-      n = contents_->size() - offset;
+  };
+
+  class StringSource : public RandomAccessFile {
+   public:
+    StringPiece contents_;
+    mutable bool force_error_;
+    mutable bool returned_partial_;
+    StringSource() : force_error_(false), returned_partial_(false) {}
+
+    Status Read(uint64 offset, size_t n, StringPiece* result,
+                char* scratch) const override {
+      EXPECT_FALSE(returned_partial_) << "must not Read() after eof/error";
+
+      if (force_error_) {
+        force_error_ = false;
+        returned_partial_ = true;
+        return errors::DataLoss("read error");
+      }
+
+      if (offset >= contents_.size()) {
+        return errors::OutOfRange("end of file");
+      }
+
+      if (contents_.size() < offset + n) {
+        n = contents_.size() - offset;
+        returned_partial_ = true;
+      }
+      *result = StringPiece(contents_.data() + offset, n);
+      return Status::OK();
     }
-    *result = StringPiece(contents_->data() + offset, n);
-    return Status::OK();
-  }
-
-  void force_error() { force_error_ = true; }
-
- private:
-  string* contents_;
-  mutable bool force_error_;
-};
+  };
 
-class RecordioTest : public ::testing::Test {
- private:
-  string contents_;
   StringDest dest_;
   StringSource source_;
   bool reading_;
@@ -109,9 +104,7 @@ class RecordioTest : public ::testing::Test {
 
  public:
   RecordioTest()
-      : dest_(&contents_),
-        source_(&contents_),
-        reading_(false),
+      : reading_(false),
         readpos_(0),
         writer_(new RecordWriter(&dest_)),
         reader_(new RecordReader(&source_)) {}
@@ -126,11 +119,12 @@ class RecordioTest : public ::testing::Test {
     TF_ASSERT_OK(writer_->WriteRecord(StringPiece(msg)));
   }
 
-  size_t WrittenBytes() const { return contents_.size(); }
+  size_t WrittenBytes() const { return dest_.contents_.size(); }
 
   string Read() {
     if (!reading_) {
       reading_ = true;
+      source_.contents_ = StringPiece(dest_.contents_);
     }
     string record;
     Status s = reader_->ReadRecord(&readpos_, &record);
@@ -143,20 +137,26 @@ class RecordioTest : public ::testing::Test {
     }
   }
 
-  void IncrementByte(int offset, int delta) { contents_[offset] += delta; }
+  void IncrementByte(int offset, int delta) {
+    dest_.contents_[offset] += delta;
+  }
 
-  void SetByte(int offset, char new_byte) { contents_[offset] = new_byte; }
+  void SetByte(int offset, char new_byte) {
+    dest_.contents_[offset] = new_byte;
+  }
 
-  void ShrinkSize(int bytes) { contents_.resize(contents_.size() - bytes); }
+  void ShrinkSize(int bytes) {
+    dest_.contents_.resize(dest_.contents_.size() - bytes);
+  }
 
   void FixChecksum(int header_offset, int len) {
     // Compute crc of type/len/data
-    uint32_t crc = crc32c::Value(&contents_[header_offset + 6], 1 + len);
+    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset + 6], 1 + len);
     crc = crc32c::Mask(crc);
-    core::EncodeFixed32(&contents_[header_offset], crc);
+    core::EncodeFixed32(&dest_.contents_[header_offset], crc);
   }
 
-  void ForceError() { source_.force_error(); }
+  void ForceError() { source_.force_error_ = true; }
 
   void StartReadingAt(uint64_t initial_offset) { readpos_ = initial_offset; }
 
@@ -165,6 +165,7 @@ class RecordioTest : public ::testing::Test {
     Write("bar");
     Write(BigString("x", 10000));
     reading_ = true;
+    source_.contents_ = StringPiece(dest_.contents_);
     uint64 offset = WrittenBytes() + offset_past_end;
     string record;
     Status s = reader_->ReadRecord(&offset, &record);
@@ -216,100 +217,16 @@ TEST_F(RecordioTest, RandomRead) {
   ASSERT_EQ("EOF", Read());
 }
 
-void TestNonSequentialReads(const RecordWriterOptions& writer_options,
-                            const RecordReaderOptions& reader_options) {
-  string contents;
-  StringDest dst(&contents);
-  RecordWriter writer(&dst, writer_options);
-  for (int i = 0; i < 10; ++i) {
-    TF_ASSERT_OK(writer.WriteRecord(NumberString(i))) << i;
-  }
-  TF_ASSERT_OK(writer.Close());
-
-  StringSource file(&contents);
-  RecordReader reader(&file, reader_options);
-
-  string record;
-  // First read sequentially to fill in the offsets table.
-  uint64 offsets[10] = {0};
-  uint64 offset = 0;
-  for (int i = 0; i < 10; ++i) {
-    offsets[i] = offset;
-    TF_ASSERT_OK(reader.ReadRecord(&offset, &record)) << i;
-  }
-
-  // Read randomly: First go back to record #3 then forward to #8.
-  offset = offsets[3];
-  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
-  EXPECT_EQ("3.", record);
-  EXPECT_EQ(offsets[4], offset);
-
-  offset = offsets[8];
-  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
-  EXPECT_EQ("8.", record);
-  EXPECT_EQ(offsets[9], offset);
-}
-
-TEST_F(RecordioTest, NonSequentialReads) {
-  TestNonSequentialReads(RecordWriterOptions(), RecordReaderOptions());
-}
-
-TEST_F(RecordioTest, NonSequentialReadsWithReadBuffer) {
-  RecordReaderOptions options;
-  options.buffer_size = 1 << 10;
-  TestNonSequentialReads(RecordWriterOptions(), options);
-}
-
-TEST_F(RecordioTest, NonSequentialReadsWithCompression) {
-  TestNonSequentialReads(
-      RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
-      RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
-}
-
 // Tests of all the error paths in log_reader.cc follow:
-void AssertHasSubstr(StringPiece s, StringPiece expected) {
+static void AssertHasSubstr(StringPiece s, StringPiece expected) {
   EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
-void TestReadError(const RecordWriterOptions& writer_options,
-                   const RecordReaderOptions& reader_options) {
-  const string wrote = BigString("well hello there!", 100);
-  string contents;
-  StringDest dst(&contents);
-  TF_ASSERT_OK(RecordWriter(&dst, writer_options).WriteRecord(wrote));
-
-  StringSource file(&contents);
-  RecordReader reader(&file, reader_options);
-
-  uint64 offset = 0;
-  string read;
-  file.force_error();
-  Status status = reader.ReadRecord(&offset, &read);
-  ASSERT_TRUE(errors::IsDataLoss(status));
-  ASSERT_EQ(0, offset);
-
-  // A failed Read() shouldn't update the offset, and thus a retry shouldn't
-  // lose the record.
-  status = reader.ReadRecord(&offset, &read);
-  ASSERT_TRUE(status.ok()) << status;
-  EXPECT_GT(offset, 0);
-  EXPECT_EQ(wrote, read);
-}
-
 TEST_F(RecordioTest, ReadError) {
-  TestReadError(RecordWriterOptions(), RecordReaderOptions());
-}
-
-TEST_F(RecordioTest, ReadErrorWithBuffering) {
-  RecordReaderOptions options;
-  options.buffer_size = 1 << 20;
-  TestReadError(RecordWriterOptions(), options);
-}
-
-TEST_F(RecordioTest, ReadErrorWithCompression) {
-  TestReadError(RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
-                RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
+  Write("foo");
+  ForceError();
+  AssertHasSubstr(Read(), "Data loss");
 }
 
 TEST_F(RecordioTest, CorruptLength) {
@@ -340,6 +257,5 @@ TEST_F(RecordioTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
 
 TEST_F(RecordioTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
 
-}  // namespace
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc
index bf8dcf0988..984fbc2810 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.cc
+++ b/tensorflow/core/lib/io/zlib_inputstream.cc
@@ -25,9 +25,8 @@ ZlibInputStream::ZlibInputStream(
     InputStreamInterface* input_stream,
     size_t input_buffer_bytes,   // size of z_stream.next_in buffer
     size_t output_buffer_bytes,  // size of z_stream.next_out buffer
-    const ZlibCompressionOptions& zlib_options, bool owns_input_stream)
-    : owns_input_stream_(owns_input_stream),
-      input_stream_(input_stream),
+    const ZlibCompressionOptions& zlib_options)
+    : input_stream_(input_stream),
       input_buffer_capacity_(input_buffer_bytes),
       output_buffer_capacity_(output_buffer_bytes),
       z_stream_input_(new Bytef[input_buffer_capacity_]),
@@ -42,14 +41,10 @@ ZlibInputStream::~ZlibInputStream() {
   if (z_stream_) {
     inflateEnd(z_stream_.get());
   }
-  if (owns_input_stream_) {
-    delete input_stream_;
-  }
 }
 
 Status ZlibInputStream::Reset() {
   TF_RETURN_IF_ERROR(input_stream_->Reset());
-  inflateEnd(z_stream_.get());
   InitZlibBuffer();
   bytes_read_ = 0;
   return Status::OK();
diff --git a/tensorflow/core/lib/io/zlib_inputstream.h b/tensorflow/core/lib/io/zlib_inputstream.h
index 6099e2455d..9c7e14441c 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.h
+++ b/tensorflow/core/lib/io/zlib_inputstream.h
@@ -40,13 +40,10 @@ class ZlibInputStream : public InputStreamInterface {
   // Create a ZlibInputStream for `input_stream` with a buffer of size
   // `input_buffer_bytes` bytes for reading contents from `input_stream` and
   // another buffer with size `output_buffer_bytes` for caching decompressed
-  // contents.
-  //
-  // Takes ownership of `input_stream` iff `owns_input_stream` is true.
+  // contents. Does *not* take ownership of "input_stream".
   ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes,
                   size_t output_buffer_bytes,
-                  const ZlibCompressionOptions& zlib_options,
-                  bool owns_input_stream = false);
+                  const ZlibCompressionOptions& zlib_options);
 
   ~ZlibInputStream();
 
@@ -68,8 +65,7 @@ class ZlibInputStream : public InputStreamInterface {
  private:
   void InitZlibBuffer();
 
-  const bool owns_input_stream_;
-  InputStreamInterface* input_stream_;
+  InputStreamInterface* input_stream_;  // Not owned
   size_t input_buffer_capacity_;        // Size of z_stream_input_
   size_t output_buffer_capacity_;       // Size of z_stream_output_
   char* next_unread_byte_;              // Next unread byte in z_stream_output_
-- 
GitLab


From b7cca088e90b4c2a28c1038980aa09240584e382 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 19 Apr 2018 18:12:57 -0700
Subject: [PATCH 197/450] Respect any device filters in
 {Create,Delete}WorkerSessions().

This is another step towards enabling us to turn on explicit worker
sessions for all master sessions.

PiperOrigin-RevId: 193605565
---
 tensorflow/core/distributed_runtime/master.cc            | 6 +++++-
 tensorflow/core/distributed_runtime/master_env.h         | 3 ++-
 tensorflow/core/distributed_runtime/master_session.cc    | 9 +++++----
 tensorflow/core/distributed_runtime/master_session.h     | 6 +++++-
 .../core/distributed_runtime/rpc/grpc_server_lib.cc      | 4 +++-
 5 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index f47502e844..288656e7f8 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -417,9 +417,13 @@ void Master::CreateSession(const CreateSessionRequest* req,
     SessionOptions options;
     options.config = req->config();
 
+    std::vector<string> filtered_worker_list;
+    DeviceFinder::GetRemoteWorkers(req->config().device_filters(), env_,
+                                   worker_cache, &filtered_worker_list);
+
     MasterSession* session = env_->master_session_factory(
         options, env_, std::move(remote_devices), std::move(worker_cache_ptr),
-        std::move(device_set));
+        std::move(device_set), std::move(filtered_worker_list));
 
     GraphDef* gdef =
         const_cast<CreateSessionRequest*>(req)->mutable_graph_def();
diff --git a/tensorflow/core/distributed_runtime/master_env.h b/tensorflow/core/distributed_runtime/master_env.h
index 178c5b40ee..16f4d93c8b 100644
--- a/tensorflow/core/distributed_runtime/master_env.h
+++ b/tensorflow/core/distributed_runtime/master_env.h
@@ -83,7 +83,8 @@ struct MasterEnv {
       SessionOptions, MasterEnv*,
       std::unique_ptr<std::vector<std::unique_ptr<Device>>>,
       std::unique_ptr<WorkerCacheInterface>,
-      std::unique_ptr<DeviceSet> device_set)>
+      std::unique_ptr<DeviceSet> device_set,
+      std::vector<string> filtered_worker_list)>
       master_session_factory;
 
   std::function<Status(const WorkerCacheFactoryOptions&,
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 7868200fb4..ebe350d313 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -416,6 +416,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
   if (!s.ok()) {
     for (Part& part : partitions_) {
       worker_cache_->ReleaseWorker(part.name, part.worker);
+      part.worker = nullptr;
     }
     return s;
   }
@@ -1119,6 +1120,7 @@ MasterSession::MasterSession(
     std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
     std::unique_ptr<WorkerCacheInterface> worker_cache,
     std::unique_ptr<DeviceSet> device_set,
+    std::vector<string> filtered_worker_list,
     StatsPublisherFactory stats_publisher_factory)
     : session_opts_(opt),
       env_(env),
@@ -1126,6 +1128,7 @@ MasterSession::MasterSession(
       remote_devs_(std::move(remote_devs)),
       worker_cache_(std::move(worker_cache)),
       devices_(std::move(device_set)),
+      filtered_worker_list_(std::move(filtered_worker_list)),
       stats_publisher_factory_(std::move(stats_publisher_factory)),
       graph_version_(0),
       run_graphs_(5),
@@ -1183,9 +1186,8 @@ Status MasterSession::Create(GraphDef* graph_def,
 
 Status MasterSession::CreateWorkerSessions(
     const WorkerCacheFactoryOptions& options) {
-  std::vector<string> worker_names;
+  const std::vector<string> worker_names = filtered_worker_list_;
   WorkerCacheInterface* worker_cache = get_worker_cache();
-  worker_cache->ListWorkers(&worker_names);
 
   struct WorkerGroup {
     // The worker name. (Not owned.)
@@ -1263,8 +1265,7 @@ Status MasterSession::CreateWorkerSessions(
 
 Status MasterSession::DeleteWorkerSessions() {
   WorkerCacheInterface* worker_cache = get_worker_cache();
-  std::vector<string> worker_names;
-  worker_cache->ListWorkers(&worker_names);
+  const std::vector<string>& worker_names = filtered_worker_list_;
 
   struct WorkerGroup {
     // The worker name. (Not owned.)
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index a05419904f..ec34e20b79 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -52,6 +52,7 @@ class MasterSession : public core::RefCounted {
       std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
       std::unique_ptr<WorkerCacheInterface> worker_cache,
       std::unique_ptr<DeviceSet> device_set,
+      std::vector<string> filtered_worker_list,
       StatsPublisherFactory stats_publisher_factory);
 
   // Initialize the MasterSession for "def".  Must be called before Extend(),
@@ -130,6 +131,10 @@ class MasterSession : public core::RefCounted {
   // The device set used by this session.
   std::unique_ptr<DeviceSet> devices_;
 
+  // The (partial device) names of remote worker tasks that this
+  // session will contact.
+  const std::vector<string> filtered_worker_list_;
+
   StatsPublisherFactory stats_publisher_factory_;
 
   std::atomic_ulong last_access_time_usec_;
@@ -212,7 +217,6 @@ class MasterSession : public core::RefCounted {
   // workers.
   Status CreateWorkerSessions(const WorkerCacheFactoryOptions& server_def);
 
-  // TODO(b/36574172): Always use Create/DeleteWorkerSession.
   bool should_delete_worker_sessions_ = false;
   Status DeleteWorkerSessions();
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index be19103582..488dcde9f5 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -222,10 +222,12 @@ Status GrpcServer::Init(
           SessionOptions options, const MasterEnv* env,
           std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
           std::unique_ptr<WorkerCacheInterface> worker_cache,
-          std::unique_ptr<DeviceSet> device_set) {
+          std::unique_ptr<DeviceSet> device_set,
+          std::vector<string> filtered_worker_list) {
         options.config.MergeFrom(config);
         return new MasterSession(options, env, std::move(remote_devs),
                                  std::move(worker_cache), std::move(device_set),
+                                 std::move(filtered_worker_list),
                                  stats_factory);
       };
   master_env_.worker_cache_factory =
-- 
GitLab


From 4f8768319cfa56c25973cc66d920146ad454bd97 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 18:17:02 -0700
Subject: [PATCH 198/450] Optimize Graph function library.

PiperOrigin-RevId: 193605910
---
 tensorflow/core/grappler/optimizers/BUILD     |   4 +
 .../grappler/optimizers/function_optimizer.cc | 126 ++++++-
 .../grappler/optimizers/function_optimizer.h  |   6 +-
 .../optimizers/function_optimizer_test.cc     |  32 +-
 .../grappler/optimizers/meta_optimizer.cc     | 326 +++++++++++-------
 .../core/grappler/optimizers/meta_optimizer.h |  33 +-
 .../optimizers/meta_optimizer_test.cc         | 172 ++++++++-
 tensorflow/core/grappler/utils/functions.cc   |  12 +-
 tensorflow/core/grappler/utils/functions.h    |  40 ++-
 .../core/grappler/utils/functions_test.cc     |   8 +-
 10 files changed, 563 insertions(+), 196 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index a371186fe6..3ab8d8f584 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -518,11 +518,13 @@ cc_library(
         ":loop_optimizer",
         ":memory_optimizer",
         ":model_pruner",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/utils:colocation",
+        "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
@@ -539,9 +541,11 @@ tf_cuda_cc_test(
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index d008a9719f..950933b933 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -75,12 +76,10 @@ string UniqueSpecializedFunctionName(const FunctionDef& func,
 
 class FunctionOptimizerContext {
  public:
-  explicit FunctionOptimizerContext(const GrapplerItem& item,
-                                    RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level),
-        function_library_(FunctionLibraryDefinition(OpRegistry::Global(),
-                                                    item.graph.library())) {
-    InitializeInlinedFunctions(item);
+  explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level,
+                                    const GrapplerItem& item)
+      : function_library_(OpRegistry::Global(), item.graph.library()) {
+    InitializeInlinedFunctions(opt_level, item);
   }
 
   const FunctionLibraryDefinition& function_library() const {
@@ -101,8 +100,9 @@ class FunctionOptimizerContext {
   }
 
  private:
-  void InitializeInlinedFunctions(const GrapplerItem& item) {
-    bool aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
+  void InitializeInlinedFunctions(RewriterConfig::Toggle opt_level,
+                                  const GrapplerItem& item) {
+    bool aggressive = opt_level == RewriterConfig::AGGRESSIVE;
 
     for (const FunctionDef& func : item.graph.library().function()) {
       // Can't create IdentityN nodes with no input or output: skip these
@@ -120,7 +120,6 @@ class FunctionOptimizerContext {
     }
   }
 
-  RewriterConfig::Toggle opt_level_;
   FunctionLibraryDefinition function_library_;
   // Functions that can be inlined into optimized graph.
   std::unordered_map<string, const FunctionDef*> inlined_functions_;
@@ -128,9 +127,93 @@ class FunctionOptimizerContext {
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext);
 };
 
+// Return trimmed FunctionDefLibrary with functions that are reachable from
+// the optimized graph.
+FunctionDefLibrary TrimFunctionLibrary(const FunctionLibraryDefinition& flib,
+                                       const GraphDef& optimized_graph) {
+  // Functions that are reachable from the optimized graph.
+  std::unordered_set<string> keep_funcs;
+
+  std::vector<const FunctionDef*> func_queue;
+  func_queue.reserve(flib.num_functions());
+
+  // Add registered and not already processed functions to the queue by name.
+  const auto add_to_func_queue = [&](const string& func_name) {
+    const FunctionDef* func = flib.Find(func_name);
+    if (func && keep_funcs.find(func_name) == keep_funcs.end()) {
+      func_queue.push_back(func);
+    }
+  };
+
+  // Find all the functions that are reachable from the given node.
+  const auto add_node_to_func_queue = [&](const NodeDef& node) {
+    // Node itself can be a call to the function.
+    add_to_func_queue(node.op());
+
+    // Or node can have an attribute referencing a function.
+    for (const auto& attr : node.attr()) {
+      const auto& attr_value = attr.second;
+
+      // 1. AttrValue.func
+      if (attr_value.has_func()) {
+        add_to_func_queue(attr_value.func().name());
+      }
+
+      // 2. AttrValue.ListValue.func
+      if (attr_value.has_list()) {
+        for (const auto& func : attr_value.list().func()) {
+          add_to_func_queue(func.name());
+        }
+      }
+    }
+  };
+
+  // Add all functions that are directly called from the optimized graph.
+  const auto& graph_nodes = optimized_graph.node();
+  std::for_each(graph_nodes.begin(), graph_nodes.end(), add_node_to_func_queue);
+
+  // Process all reachable functions.
+  while (!func_queue.empty()) {
+    const FunctionDef* func = func_queue.back();
+    func_queue.pop_back();
+
+    const string& func_name = func->signature().name();
+    keep_funcs.insert(func_name);
+
+    // Find all the functions that called from the function body.
+    const auto& func_body = func->node_def();
+    std::for_each(func_body.begin(), func_body.end(), add_node_to_func_queue);
+
+    // Check if the function has a registered gradient.
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
+  }
+
+  FunctionDefLibrary lib;
+  for (const string& func_name : keep_funcs) {
+    const FunctionDef* func = CHECK_NOTNULL(flib.Find(func_name));
+    *lib.add_function() = *func;
+
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) {
+      GradientDef* gd = lib.add_gradient();
+      gd->set_function_name(func_name);
+      gd->set_gradient_func(grad_func_name);
+    }
+  }
+
+  VLOG(3) << "Trimmed function library: " << keep_funcs.size() << " functions ("
+          << static_cast<int>(keep_funcs.size() - flib.num_functions()) << ")";
+
+  return lib;
+}
+
 Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
                           FunctionOptimizerContext* ctx,
                           GraphDef* optimized_graph) {
+  VLOG(2) << "Specialize function instantiation: "
+          << SummarizeNodeDef(func_node);
+
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
@@ -141,20 +224,20 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   // TODO(ezhulenev): Push down const inputs and known input shapes.
-  FunctionDef specialized;
-  TF_RETURN_IF_ERROR(MakeSpecializedFunctionDef(item, flib, &specialized));
+  FunctionDef specialized_func;
+  TF_RETURN_IF_ERROR(MakeFunctionDef(item, flib, &specialized_func));
 
   // Find a name for specialized function.
   const string specialized_func_name =
       UniqueSpecializedFunctionName(func, func_node, flib);
 
-  specialized.mutable_signature()->set_name(specialized_func_name);
-  auto* specialized_attr = specialized.mutable_attr();
+  specialized_func.mutable_signature()->set_name(specialized_func_name);
+  auto* specialized_attr = specialized_func.mutable_attr();
   (*specialized_attr)[kGrapplerSpecializedFuncAttr].set_b(true);
 
   // Add specialized function to the library.
   TF_RETURN_IF_ERROR(
-      ctx->mutable_function_library().AddFunctionDef(specialized));
+      ctx->mutable_function_library().AddFunctionDef(specialized_func));
 
   // Add a function call node for the specialized function.
   NodeDef* specialized_func_node = optimized_graph->add_node();
@@ -226,6 +309,8 @@ Status HookInlinedFunctionOutputs(
 Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
                       const FunctionOptimizerContext& ctx,
                       GraphDef* optimized_graph) {
+  VLOG(2) << "Inline function instantiation: " << SummarizeNodeDef(func_node);
+
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
@@ -359,6 +444,8 @@ class SymbolicGradientEnv {
 
 Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
                               GraphDef* inlined_graph) {
+  VLOG(2) << "Inline symbolic gradient: " << SummarizeNodeDef(node);
+
   GraphDef graph_def;
 
   // Create a node to anchor the gradient inputs
@@ -454,13 +541,16 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
 
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
+  VLOG(2) << "Optimize function library: id=" << item.id;
+
   // Nothing to do here.
   if (item.graph.library().function_size() == 0) {
+    VLOG(3) << "Skip Grappler item with empty function library";
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
-  FunctionOptimizerContext ctx(item, opt_level_);
+  FunctionOptimizerContext ctx(opt_level_, item);
   SymbolicGradientEnv env(item.graph.versions().producer(),
                           item.graph.library());
 
@@ -506,9 +596,11 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     *optimized_graph->add_node() = node;
   }
 
-  // TODO(bsteiner): trim the library to remove unused function definitions
   *optimized_graph->mutable_versions() = item.graph.versions();
-  *optimized_graph->mutable_library() = ctx.function_library().ToProto();
+  *optimized_graph->mutable_library() =
+      options_.enable_trim_function_library
+          ? TrimFunctionLibrary(ctx.function_library(), *optimized_graph)
+          : ctx.function_library().ToProto();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index c555fadf83..e307b4e533 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -26,8 +26,9 @@ namespace grappler {
 // operations to make the overall graph more efficient.
 class FunctionOptimizer : public GraphOptimizer {
  public:
-  FunctionOptimizer(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {}
-  ~FunctionOptimizer() override {}
+  explicit FunctionOptimizer(RewriterConfig::Toggle opt_level)
+      : opt_level_(opt_level) {}
+  ~FunctionOptimizer() override = default;
 
   string name() const override { return "function_optimizer"; };
 
@@ -44,6 +45,7 @@ class FunctionOptimizer : public GraphOptimizer {
     bool enable_function_inlining = true;
     bool enable_function_specialization = true;
     bool enable_symbolic_gradient_inlining = true;
+    bool enable_trim_function_library = true;
   };
 
   RewriterConfig::Toggle opt_level_;
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index fb006d4868..6147e8a27c 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -31,20 +31,8 @@ constexpr char kDevice[] = "/device:CPU:0";
 
 class FunctionOptimizerTest : public GrapplerTest {
  protected:
-  void DisableAll(FunctionOptimizer* optimizer) {
-    optimizer->options_.enable_function_inlining = false;
+  void DisableFunctionSpecialization(FunctionOptimizer* optimizer) {
     optimizer->options_.enable_function_specialization = false;
-    optimizer->options_.enable_symbolic_gradient_inlining = false;
-  }
-
-  void EnableOnlyFunctionInlining(FunctionOptimizer* optimizer) {
-    DisableAll(optimizer);
-    optimizer->options_.enable_function_inlining = true;
-  }
-
-  void EnableOnlyFunctionSpecialization(FunctionOptimizer* optimizer) {
-    DisableAll(optimizer);
-    optimizer->options_.enable_function_specialization = true;
   }
 };
 
@@ -352,7 +340,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithoutInput) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-  EnableOnlyFunctionInlining(&optimizer);
+  DisableFunctionSpecialization(&optimizer);  // do not specialize noinline func
 
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
@@ -626,14 +614,13 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-  EnableOnlyFunctionSpecialization(&optimizer);
 
-  // Mark XTimesTwo as noinline
+  // Mark XTimesTwo as noinline.
   FunctionDef x_times_two = test::function::XTimesTwo();
   (*x_times_two.mutable_attr())["_noinline"].set_b(true);
   std::vector<FunctionDef> function_library = {x_times_two};
 
-  // Build a graph to compute y = XTimesTwo(x)
+  // Build a graph to compute y = XTimesTwo(x).
   GrapplerItem item;
   item.graph = test::function::GDef(
       {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
@@ -644,12 +631,13 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  // Make sure that specialized function was added to the library
-  EXPECT_EQ(2, output.library().function_size());
+  // Make sure that specialized function was added to the library and original
+  // function was removed.
+  EXPECT_EQ(1, output.library().function_size());
   EXPECT_EQ("XTimesTwo_specialized_for_y",
-            output.library().function(1).signature().name());
+            output.library().function(0).signature().name());
 
-  // And 'y' node is calling specialized function
+  // And 'y' node is calling specialized function.
   int count = 0;
   for (const NodeDef& node : output.node()) {
     if (node.name() == "y" && count++) {
@@ -658,7 +646,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   }
   EXPECT_EQ(1, count);
 
-  // And that graph evaluation yields the same result
+  // And that graph evaluation yields the same result.
   Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 558b8a77e8..22799311bc 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
@@ -29,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils/colocation.h"
+#include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -36,6 +38,9 @@ namespace tensorflow {
 namespace grappler {
 
 namespace {
+
+constexpr int kDefaultNumberOfIterations = 1;
+
 int64 NumEdges(const GraphDef& graph) {
   int64 num_edges = 0;
   for (const auto& node : graph.node()) {
@@ -50,144 +55,138 @@ string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) {
                          NumEdges(after), " edges (",
                          NumEdges(after) - NumEdges(before), ")");
 }
+
+int NumIterations(const RewriterConfig& cfg) {
+  return cfg.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
+             ? kDefaultNumberOfIterations
+             : cfg.meta_optimizer_iterations();
+}
+
+// Check if optimizer is allowed to run only once.
+int IsRunOnceOptimizer(const string& name) { return name == "layout"; }
+
 }  // namespace
 
-std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
-    const string& optimizer) {
-  std::unique_ptr<GraphOptimizer> graph_optimizer;
-  if (optimizer == "pruning") {
-    graph_optimizer.reset(new ModelPruner());
-  }
-  if (optimizer == "function") {
-    graph_optimizer.reset(new FunctionOptimizer(cfg_.function_optimization()));
+std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
+    const string& optimizer) const {
+#define MK_OPT(NAME, VALUE) \
+  if (optimizer == NAME) return std::unique_ptr<GraphOptimizer>(VALUE)
+
+  MK_OPT("pruning", new ModelPruner());
+  MK_OPT("function", new FunctionOptimizer(cfg_.function_optimization()));
+  MK_OPT("constfold", new ConstantFolding(cpu_device_));
+  MK_OPT("layout", new LayoutOptimizer());
+  MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL));
+  MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
+  MK_OPT("autoparallel", new AutoParallel(cfg_.auto_parallel().num_replicas()));
+  MK_OPT("loop", new LoopOptimizer(cfg_.loop_optimization()));
+  MK_OPT("dependency", new DependencyOptimizer(cfg_.dependency_optimization()));
+  MK_OPT("debug_stripper", new DebugStripper());
+
+  return std::unique_ptr<GraphOptimizer>();
+#undef MK_OPT
+}
+
+Status MetaOptimizer::InitializeOptimizers(
+    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  if (!cfg_.disable_model_pruning()) {
+    optimizers->emplace_back(new ModelPruner());
   }
-  if (optimizer == "constfold") {
-    graph_optimizer.reset(new ConstantFolding(cpu_device_));
+  if (cfg_.function_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new FunctionOptimizer(cfg_.function_optimization()));
   }
-  if (optimizer == "layout") {
-    graph_optimizer.reset(new LayoutOptimizer());
+  if (cfg_.debug_stripper() == RewriterConfig::ON) {
+    optimizers->emplace_back(new DebugStripper());
   }
-  if (optimizer == "memory") {
-    graph_optimizer.reset(new MemoryOptimizer(RewriterConfig::MANUAL));
+  if (cfg_.constant_folding() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new ConstantFolding(cfg_.constant_folding(), cpu_device_));
   }
-  if (optimizer == "arithmetic") {
-    graph_optimizer.reset(
+  if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
         new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
   }
-  if (optimizer == "autoparallel") {
-    graph_optimizer.reset(
-        new AutoParallel(cfg_.auto_parallel().num_replicas()));
-  }
-  if (optimizer == "loop") {
-    graph_optimizer.reset(new LoopOptimizer(cfg_.loop_optimization()));
+  if (cfg_.loop_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new LoopOptimizer(cfg_.loop_optimization()));
   }
-  if (optimizer == "dependency") {
-    graph_optimizer.reset(
+  if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
         new DependencyOptimizer(cfg_.dependency_optimization()));
   }
-  if (optimizer == "debug_stripper") {
-    graph_optimizer.reset(new DebugStripper());
+  if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new LayoutOptimizer());
+  }
+  if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
+    if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
+      optimizers->emplace_back(
+          // Use the default target node name prefix "gradients/"
+          new MemoryOptimizer(cfg_.memory_optimization()));
+    } else {
+      optimizers->emplace_back(
+          new MemoryOptimizer(cfg_.memory_optimization(),
+                              cfg_.memory_optimizer_target_node_name_scope()));
+    }
+  }
+  if (cfg_.auto_parallel().enable()) {
+    optimizers->emplace_back(
+        new AutoParallel(cfg_.auto_parallel().num_replicas()));
   }
-  return graph_optimizer;
+  return Status::OK();
 }
 
-Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* optimized_graph) {
-  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
-  if (cfg_.optimizers().empty()) {
-    if (!cfg_.disable_model_pruning()) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(new ModelPruner()));
-    }
-    if (cfg_.function_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new FunctionOptimizer(cfg_.function_optimization())));
-    }
-    if (cfg_.debug_stripper() == RewriterConfig::ON) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new DebugStripper()));
-    }
-    if (cfg_.constant_folding() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new ConstantFolding(cfg_.constant_folding(), cpu_device_)));
-    }
-    if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new ArithmeticOptimizer(cfg_.arithmetic_optimization())));
+Status MetaOptimizer::InitializeOptimizersByName(
+    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  for (const string& optimizer_name : cfg_.optimizers()) {
+    auto optimizer = MakeNewOptimizer(optimizer_name);
+    if (optimizer) {
+      VLOG(2) << "Registered default graph optimizer: " << optimizer_name;
+      optimizers->push_back(std::move(optimizer));
+      continue;
     }
-    if (cfg_.loop_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new LoopOptimizer(cfg_.loop_optimization())));
-    }
-    if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new DependencyOptimizer(cfg_.dependency_optimization())));
-    }
-    if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new LayoutOptimizer()));
-    }
-    if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
-      if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
-        optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-            // Use the default target node name prefix "gradients/"
-            new MemoryOptimizer(cfg_.memory_optimization())));
-      } else {
-        optimizers.push_back(
-            std::unique_ptr<GraphOptimizer>(new MemoryOptimizer(
-                cfg_.memory_optimization(),
-                cfg_.memory_optimizer_target_node_name_scope())));
-      }
-    }
-    if (cfg_.auto_parallel().enable()) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new AutoParallel(cfg_.auto_parallel().num_replicas())));
-    }
-  } else {
-    const std::set<string> available_optimizers = {
-        "pruning",    "function",      "constfold",  "layout",
-        "memory",     "autoparallel",  "arithmetic", "loop",
-        "dependency", "debug_stripper"};
-    std::vector<string> custom_optimizer_names;
-    for (const auto& optimizer_name : cfg_.optimizers()) {
-      if (available_optimizers.find(optimizer_name) !=
-          available_optimizers.end()) {
-        optimizers.push_back(NewOptimizer(optimizer_name));
-      } else {
-        custom_optimizer_names.push_back(optimizer_name);
-      }
-    }
-    // Now run the custom optimizers.
-    for (const auto& optimizer_name : custom_optimizer_names) {
-      std::unique_ptr<CustomGraphOptimizer> opt =
-          CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
-      if (opt == nullptr) continue;
-      TF_RETURN_IF_ERROR(opt->Init());
-      optimizers.push_back(std::move(opt));
+
+    auto custom_optimizer =
+        CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
+
+    if (custom_optimizer) {
+      VLOG(2) << "Registered custom graph optimizer: " << optimizer_name;
+      TF_RETURN_IF_ERROR(custom_optimizer->Init());
+      optimizers->push_back(std::move(custom_optimizer));
+    } else {
+      VLOG(2) << "Can't register an optimizer by name: " << optimizer_name;
     }
   }
+  return Status::OK();
+}
+
+Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
+                                    GraphDef* optimized_graph) {
+  VLOG(2) << "Optimize GrapplerItem: item.id=" << item.id;
+
+  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
+  bool register_by_name = !cfg_.optimizers().empty();
+  TF_RETURN_IF_ERROR(register_by_name ? InitializeOptimizersByName(&optimizers)
+                                      : InitializeOptimizers(&optimizers));
 
   if (optimizers.empty()) {
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
-  // Some optimizers should be run only once.
-  const std::set<string> run_once_optimizers = {"layout"};
-  bool already_optimized = false;
-  const int num_iterations =
-      cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
-          ? 1
-          : cfg_.meta_optimizer_iterations();
+  // Invariant: optimized_graph contains the most recently optimized version of
+  // the graph.
   GrapplerItem optimized_item = item;
   optimized_graph->Swap(&optimized_item.graph);
-  for (int iteration = 0; iteration < num_iterations; ++iteration) {
-    VLOG(1) << "Starting optimization iteration " << iteration + 1;
+
+  GraphOptimizationResult optimization_result(item.id);
+
+  for (int iteration = 0; iteration < NumIterations(cfg_); ++iteration) {
+    VLOG(4) << "Starting optimization iteration " << iteration + 1;
+
     for (const auto& optimizer : optimizers) {
-      // Invariant: optimized_graph contains the most recently optimized
-      // version of the graph.
-      if (iteration > 0 && run_once_optimizers.count(optimizer->name())) {
-        continue;
-      }
+      // Some optimizers can run only once.
+      if (iteration > 0 && IsRunOnceOptimizer(optimizer->name())) continue;
+
       uint64 start_us = Env::Default()->NowMicros();
       // This swaps the current optimized_graph into optimized item and
       // resets optimized_graph to an empty graph.
@@ -195,45 +194,114 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       *optimized_graph = GraphDef();
       Status status =
           optimizer->Optimize(cluster, optimized_item, optimized_graph);
-
       uint64 end_us = Env::Default()->NowMicros();
-      float duration_ms = (end_us - start_us) / 1000.0f;
+
       string result;
       if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
-                << status.ToString();
         optimized_graph->Swap(&optimized_item.graph);
         result = status.ToString();
       } else {
-        already_optimized = true;
+        optimization_result.is_optimized = true;
+        float duration_ms = (end_us - start_us) / 1000.0f;
         result = strings::StrCat(
-            optimizer->name(), ": ",
             PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph),
             ", time = ", duration_ms, "ms.");
       }
-      result_.emplace_back(optimizer->name(), result);
-      VLOG(1) << result;
+      VLOG(4) << optimizer->name() << ": " << result;
+
+      OptimizerResult optimizer_result{optimizer->name(), result};
+      optimization_result.results.push_back(optimizer_result);
     }
   }
 
-  if (already_optimized) {
+  // Record graph optimization result.
+  optimization_results_.push_back(optimization_result);
+
+  if (optimization_result.is_optimized) {
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
     ReassignColocation(optimized_graph);
-    // Make sure that the optimizers preserved the graph version and library.
-    DCHECK_GE(optimized_graph->library().function_size(),
-              item.graph.library().function_size());
-    DCHECK_GE(optimized_graph->library().gradient_size(),
-              item.graph.library().gradient_size());
+    // Make sure that the optimizers preserved the graph version.
     DCHECK_EQ(optimized_graph->versions().producer(),
               item.graph.versions().producer());
   }
+
+  return Status::OK();
+}
+
+Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* optimized_graph) {
+  optimization_results_.clear();
+
+  // 1. Optimize main graph
+  TF_RETURN_IF_ERROR(OptimizeGraph(cluster, item, optimized_graph));
+
+  // 2. Optimize function library
+  FunctionLibraryDefinition flib(OpRegistry::Global(),
+                                 optimized_graph->library());
+
+  // Optimize each function only once.
+  std::unordered_set<string> optimized_funcs;
+  bool optimize_function_library = true;
+
+  while (optimize_function_library) {
+    optimize_function_library = false;
+
+    for (const FunctionDef& func : optimized_graph->library().function()) {
+      const string& func_name = func.signature().name();
+
+      // Skip already optimized functions.
+      if (optimized_funcs.find(func_name) != optimized_funcs.end()) continue;
+
+      // Skip parametrized functions (function type or body is defined only at
+      // function call time by caller node attributes).
+      if (IsParametrized(func)) continue;
+
+      VLOG(3) << "Optimize function: function=" << func_name;
+
+      // Function optimization might specialize nested function calls, so we
+      // have to reset the flag and do at least one more pass over the library.
+      optimize_function_library = true;
+      optimized_funcs.insert(func_name);
+
+      // Make a GrapplerItem from a FunctionDef.
+      GrapplerFunctionItem func_item;
+      TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, flib, &func_item));
+
+      // Optimize function body graph.
+      GraphDef optimized_func_graph;
+      TF_RETURN_IF_ERROR(
+          OptimizeGraph(cluster, func_item, &optimized_func_graph));
+
+      // Function body optimization might have created new specialized
+      // functions, add them to the library.
+      TF_RETURN_IF_ERROR(flib.AddLibrary(optimized_func_graph.library()));
+
+      // Convert optimized graph back to FunctionDef.
+      FunctionDef optimized_func;
+      func_item.SwapFunctionBody(std::move(optimized_func_graph));
+      TF_RETURN_IF_ERROR(MakeFunctionDef(func_item, flib, &optimized_func));
+
+      // Replace optimized function with a new FunctionDef.
+      TF_RETURN_IF_ERROR(flib.RemoveFunction(func_name));
+      TF_RETURN_IF_ERROR(flib.AddFunctionDef(optimized_func));
+    }
+
+    // If optimized at least one function, update the graph library.
+    if (optimize_function_library) {
+      *optimized_graph->mutable_library() = flib.ToProto();
+    }
+  }
+
   return Status::OK();
 }
 
 void MetaOptimizer::PrintResult() {
-  for (const auto& result : result_) {
-    LOG(INFO) << "Return status of optimizer " << result.first << ": "
-              << result.second;
+  for (const GraphOptimizationResult& graph_result : optimization_results_) {
+    LOG(INFO) << "Optimization results for grappler item: " << graph_result.id;
+    for (const OptimizerResult& result : graph_result.results) {
+      LOG(INFO) << "Return status of optimizer " << result.optimizer_name
+                << ": " << result.result;
+    }
   }
 }
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index 382cfe51d4..7cf9a40c2d 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -30,7 +30,7 @@ class MetaOptimizer : public GraphOptimizer {
  public:
   MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg)
       : cpu_device_(cpu_device), cfg_(cfg) {}
-  ~MetaOptimizer() override {}
+  ~MetaOptimizer() override = default;
 
   string name() const override { return "meta_optimizer"; };
 
@@ -43,10 +43,37 @@ class MetaOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
-  std::unique_ptr<GraphOptimizer> NewOptimizer(const string& optimizer);
+  std::unique_ptr<GraphOptimizer> MakeNewOptimizer(
+      const string& optimizer) const;
+
+  // Initialize active optimizers from RewriterConfig toggles.
+  Status InitializeOptimizers(
+      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
+  // Initialize active optimizers from RewriterConfig optimizer names.
+  Status InitializeOptimizersByName(
+      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
+
+  // Run optimization pass over a single GrapplerItem. Meta optimizer might run
+  // multiple such passes: 1) for the main graph 2) for the function library
+  Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
+                       GraphDef* optimized_graph);
+
   DeviceBase* const cpu_device_;  // may be NULL
   RewriterConfig cfg_;
-  std::vector<std::pair<string, string>> result_;
+
+  struct OptimizerResult {
+    string optimizer_name;
+    string result;
+  };
+
+  struct GraphOptimizationResult {
+    explicit GraphOptimizationResult(const string& id) : id(id) {}
+    string id;
+    bool is_optimized = false;
+    std::vector<OptimizerResult> results;
+  };
+
+  std::vector<GraphOptimizationResult> optimization_results_;
 };
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index d9a386b9be..8793ad9633 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -16,11 +16,14 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -28,6 +31,8 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+constexpr char kDevice[] = "/device:CPU:0";
+
 class TestOptimizer : public CustomGraphOptimizer {
  public:
   static void SetOptimized(const bool flag_value) { optimized_ = flag_value; }
@@ -56,7 +61,9 @@ bool TestOptimizer::optimized_;
 
 REGISTER_GRAPH_OPTIMIZER(TestOptimizer);
 
-TEST(MetaOptimizerTest, RunsCustomOptimizer) {
+class MetaOptimizerTest : public GrapplerTest {};
+
+TEST_F(MetaOptimizerTest, RunsCustomOptimizer) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
@@ -72,7 +79,7 @@ TEST(MetaOptimizerTest, RunsCustomOptimizer) {
   EXPECT_TRUE(TestOptimizer::IsOptimized());
 }
 
-TEST(MetaOptimizerTest, RunOptimizersTwice) {
+TEST_F(MetaOptimizerTest, RunOptimizersTwice) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
@@ -86,6 +93,167 @@ TEST(MetaOptimizerTest, RunOptimizersTwice) {
   TF_EXPECT_OK(status);
 }
 
+TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
+  using test::function::NDef;
+
+  // Enable only function optimization.
+  RewriterConfig rewriter_config;
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+  rewriter_config.set_function_optimization(RewriterConfig::ON);
+  rewriter_config.add_optimizers("function");
+
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+
+  // Define function library:
+  //
+  //   MyMul(x, y)    = x * y
+  //  *MySquare(x)    = MyMul(x, x)
+  //  *MyQuadratic(x) = MySquare(MySquare(x))
+  //
+  //  * - marked as noinline
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  FunctionDef square_func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"my_mul"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "my_mul:z:0"}});
+  (*square_func.mutable_attr())["_noinline"].set_b(true);
+
+  FunctionDef quadratic_func = FunctionDefHelper::Create(
+      "MyQuadratic", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"square"}, "MySquare", {"x"}, {{"T", "$T"}}},
+       {{"quadratic"}, "MySquare", {"square:z"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "quadratic:z:0"}});
+  (*quadratic_func.mutable_attr())["_noinline"].set_b(true);
+
+  // Tensorflow graph:
+  //
+  //   a = tf.Placeholder(tf.float);
+  //   b = tf.Placeholder(tf.int32);
+  //
+  //   square = MySquare(a);        // a^2
+  //   quadratic = MyQuadratic(b);  // b^4
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice),
+       // Calls into function library
+       NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("quadratic", "MyQuadratic", {"b"}, {{"T", DT_INT32}}, kDevice),
+       // Forward outputs
+       NDef("out_s", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out_q", "Identity", {"quadratic:0"}, {{"T", DT_INT32}}, kDevice)},
+      // FunctionLib
+      {mul_func, square_func, quadratic_func});
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  FunctionLibraryDefinition optimized_flib(OpRegistry::Global(),
+                                           output.library());
+
+  // Specialized and optimized functions should be added to the graph.
+  EXPECT_EQ(6, optimized_flib.num_functions());
+
+  // MyQuadratic should be specialized once:
+  //   0. 'quadratic' node in the main graph
+  const string optimized_0 = "MyQuadratic_specialized_for_quadratic";
+
+  // MySquare should be specialized and optimized for 3 instantiations:
+  //   1. 'square' node in the main graph
+  //   2. 'square' node in the MyQuadratic specialization
+  //   3. 'quadratic' node in the MyQuadratic specialization
+
+  const string optimized_1 = "MySquare_specialized_for_square";
+  const string optimized_2 = "MySquare_specialized_for_square_1";
+  const string optimized_3 = "MySquare_specialized_for_quadratic";
+
+  const FunctionDef* optimized_func_0 = optimized_flib.Find(optimized_0);
+  const FunctionDef* optimized_func_1 = optimized_flib.Find(optimized_1);
+  const FunctionDef* optimized_func_2 = optimized_flib.Find(optimized_2);
+  const FunctionDef* optimized_func_3 = optimized_flib.Find(optimized_3);
+
+  ASSERT_NE(optimized_func_0, nullptr);
+  ASSERT_NE(optimized_func_1, nullptr);
+  ASSERT_NE(optimized_func_2, nullptr);
+  ASSERT_NE(optimized_func_3, nullptr);
+
+  // Graph should call optimized functions (++count: first match is checked).
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "square" && ++count) {
+      EXPECT_EQ("MySquare_specialized_for_square", node.op());
+    } else if (node.name() == "quadratic" && ++count) {
+      EXPECT_EQ("MyQuadratic_specialized_for_quadratic", node.op());
+    }
+  }
+  EXPECT_EQ(2, count);
+
+  // Specialized MyQuadratic should call specialized MySquare functions.
+  count = 0;
+  for (const NodeDef& node : optimized_func_0->node_def()) {
+    if (node.name() == "square" && ++count) {
+      EXPECT_EQ(optimized_2, node.op());
+    } else if (node.name() == "quadratic" && ++count) {
+      EXPECT_EQ(optimized_3, node.op());
+    }
+  }
+  EXPECT_EQ(2, count);
+
+  const std::vector<const FunctionDef*> optimized_funcs = {
+      optimized_func_1, optimized_func_2, optimized_func_3};
+
+  // MyMul should be inlined into all optimized versions of MySquare.
+  for (const FunctionDef* optimized_func : optimized_funcs) {
+    count = 0;
+    for (const NodeDef& node : optimized_func->node_def()) {
+      if (node.name() == "my_mul/inlined_inputs" && ++count) {
+        EXPECT_EQ("IdentityN", node.op());
+        EXPECT_EQ(2, node.input_size());
+        EXPECT_EQ("x:0", node.input(0));
+        EXPECT_EQ("x:0", node.input(1));
+      } else if (node.name() == "my_mul/x" && ++count) {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ(1, node.input_size());
+        EXPECT_EQ("my_mul/inlined_inputs:output:0", node.input(0));
+      } else if (node.name() == "my_mul/y" && ++count) {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ(1, node.input_size());
+        EXPECT_EQ("my_mul/inlined_inputs:output:1", node.input(0));
+      } else if (node.name() == "my_mul/mul" && ++count) {
+        EXPECT_EQ("Mul", node.op());
+        EXPECT_EQ(2, node.input_size());
+        EXPECT_EQ("my_mul/x:output:0", node.input(0));
+        EXPECT_EQ("my_mul/y:output:0", node.input(1));
+      } else if (node.name() == "my_mul" && ++count) {
+        EXPECT_EQ("IdentityN", node.op());
+        EXPECT_EQ(1, node.input_size());
+        EXPECT_EQ("my_mul/mul:z:0", node.input(0));
+      }
+      EXPECT_TRUE(node.device().empty());
+    }
+    EXPECT_EQ(5, count);
+  }
+
+  item.fetch = {"out_s", "out_q"};
+  item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
+  item.feed.emplace_back("b", test::AsScalar<int>(4));
+  auto tensors_expected = EvaluateFetchNodes(item);
+
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorEqual<int>(tensors_expected[1], tensors[1]);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 638fe1999a..790809bc67 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -545,6 +545,12 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
   return Status::OK();
 }
 
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item) {
+  return MakeGrapplerFunctionItem(func, AttrValueMap(), flib, item);
+}
+
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity.
 Status RegisterGrapplerFunctionConnectivity(
@@ -560,9 +566,9 @@ Status RegisterGrapplerFunctionConnectivity(
   return Status::OK();
 }
 
-Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
-                                  const FunctionLibraryDefinition& flib,
-                                  FunctionDef* func) {
+Status MakeFunctionDef(const GrapplerFunctionItem& item,
+                       const FunctionLibraryDefinition& flib,
+                       FunctionDef* func) {
   func->mutable_signature()->set_name(item.id);
   func->mutable_signature()->set_is_stateful(item.is_stateful());
 
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index ab369bcad7..5e8b6c6960 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -38,7 +38,8 @@ using AttrValueMap = std::unordered_map<string, AttrValue>;
 // function body in place of function inputs and a resolved input data type.
 struct InputArgExpansion {
   // TODO(ezhulenev): Add support for functions with tensor sequence inputs of
-  // different data types
+  // different data types.
+  // TODO(ezhulenev): Support type parametrized inputs?
   string input_name;                 // name of the function input argument
   DataType data_type;                // input data type
   bool is_ref;                       // if true, inputs are required to be refs
@@ -53,7 +54,8 @@ struct InputArgExpansion {
 // tensors of a function body nodes and a resolved output data type
 struct OutputArgExpansion {
   // TODO(ezhulenev): Add support for functions with tensor sequence outputs of
-  // different data types
+  // different data types.
+  // TODO(ezhulenev): Support type parametrized outputs?
   string output_name;                  // name of the function output argument
   DataType data_type;                  // output data type
   bool is_ref;                         // if true, outputs are refs
@@ -186,13 +188,6 @@ bool HasParametrizedBody(const FunctionDef& func);
 // Check if function has parametrized type or body.
 bool IsParametrized(const FunctionDef& func);
 
-// Make a GrapplerFunctionItem from the function definition and attributes.
-// Return error if the given function def cannot be converted.
-Status MakeGrapplerFunctionItem(
-    const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
-    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
-
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity.  Use function library definition to
 // lookup function body nodes output names and ranges.
@@ -200,11 +195,28 @@ Status RegisterGrapplerFunctionConnectivity(
     const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
     GrapplerFunctionConnectivity* connectivity);
 
-// Make a specialized FunctionDef from the GrapplerFunctionItem. Use function
-// library definition to lookup function body nodes output names and ranges.
-Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
-                                  const FunctionLibraryDefinition& flib,
-                                  FunctionDef* func);
+// Make a GrapplerFunctionItem from the function definition and function
+// instantiation attributes (caller node attributes). Returns error if the given
+// function def cannot be converted (e.g. not all attributes are defined).
+Status MakeGrapplerFunctionItem(
+    const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
+    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
+
+// Make a GrapplerFunction item from the function definition. Function must be
+// fully defined (no type or body parametrization).
+// TODO(ezhulenev): Support parametrized functions without fully defined
+// instantiation attributes? Do we ever want to optimize parametrized function
+// without specializing it to its instantiation attributes (at least types)?
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item);
+
+// Make a FunctionDef from the GrapplerFunctionItem. Use function library
+// definition to lookup function body nodes output names and ranges.
+Status MakeFunctionDef(const GrapplerFunctionItem& item,
+                       const FunctionLibraryDefinition& flib,
+                       FunctionDef* func);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 54d235a8a4..6dfd49b943 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -524,7 +524,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
   EXPECT_EQ("two", cast.input(0));
 }
 
-TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
+TEST_F(FunctionsTest, MakeFunctionDef) {
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
       // Name
@@ -550,7 +550,7 @@ TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
   TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   FunctionDef specialized;
-  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
+  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
 
   // Input and output types are resolved based on instantiation attributes.
   EXPECT_EQ("x", specialized.signature().input_arg(0).name());
@@ -573,7 +573,7 @@ TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
   EXPECT_EQ(2, count);
 }
 
-TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) {
+TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
   using test::function::NDef;
 
   FunctionDef mul_func = FunctionDefHelper::Create(
@@ -606,7 +606,7 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) {
   // Replace function body with identity function
   item.SwapFunctionBody(std::move(id_func_body));
   FunctionDef specialized;
-  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
+  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
 
   // Check that graph body was updated.
   int count = 0;
-- 
GitLab


From 256aad5324d163c028da0dc0318c3e00cf2fc3ab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 18:29:00 -0700
Subject: [PATCH 199/450] [XLA] Fix a bug in the name_uniquer.

The problem happens because the name_uniquer stripped away the numeric suffix if it was <= 0. The solution: if the requested name had a numeric suffix, the result should also have a numeric suffix.

PiperOrigin-RevId: 193606838
---
 tensorflow/compiler/xla/service/name_uniquer.cc      | 11 ++++++-----
 tensorflow/compiler/xla/service/name_uniquer_test.cc | 11 +++++++++--
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index 7d8c05fffa..f74bcb0b79 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -53,17 +53,18 @@ NameUniquer::NameUniquer(const string& separator) {
 }
 
 string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
-  string root = prefix.empty() ? "name" : prefix.ToString();
-  root = GetSanitizedName(root);
+  string root = GetSanitizedName(prefix.empty() ? "name" : prefix.ToString());
 
   // Strip away numeric suffix (if any). Only recognize separator if it is in
   // the middle of the name.
+  bool has_numeric_suffix = false;
+  int64 numeric_suffix = 0;
   size_t separator_index = root.rfind(separator_);
   if (separator_index != string::npos && (separator_index > 0) &&
       (separator_index < root.size() - 1)) {
     string after_suffix = root.substr(separator_index + 1);
-    int64 numeric_suffix;
     if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) {
+      has_numeric_suffix = true;
       // Remove numeric suffix from root.
       root = root.substr(0, separator_index);
       // Update count to at least the numeric suffix value to avoid future
@@ -71,11 +72,11 @@ string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
       generated_names_[root] = std::max(generated_names_[root], numeric_suffix);
     }
   }
-
   int64* count = &(generated_names_[root]);
   if (*count == 0) {
     *count = 1;
-    return root;
+    return has_numeric_suffix ? tensorflow::strings::StrCat(root, separator_, 0)
+                              : root;
   } else {
     tensorflow::strings::StrAppend(&root, separator_, *count);
     // Increment lookup under old 'root' name.
diff --git a/tensorflow/compiler/xla/service/name_uniquer_test.cc b/tensorflow/compiler/xla/service/name_uniquer_test.cc
index 4258cf1687..2ec255558c 100644
--- a/tensorflow/compiler/xla/service/name_uniquer_test.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer_test.cc
@@ -57,11 +57,18 @@ TEST_F(NameUniquerTest, NumericSuffixes) {
   EXPECT_EQ("foo.55", uniquer.GetUniqueName("foo"));
   EXPECT_EQ("foo.55.1", uniquer.GetUniqueName("foo.55.1"));
   EXPECT_EQ("foo.55.2", uniquer.GetUniqueName("foo.55.1"));
-  EXPECT_EQ("bar", uniquer.GetUniqueName("bar.-1000"));
+  EXPECT_EQ("bar.0", uniquer.GetUniqueName("bar.-1000"));
   EXPECT_EQ("bar.1", uniquer.GetUniqueName("bar.-2000"));
   EXPECT_EQ("bar.2", uniquer.GetUniqueName("bar.1"));
 }
 
+TEST_F(NameUniquerTest, PrefixHasSuffix) {
+  NameUniquer uniquer(".");
+
+  EXPECT_EQ("foo.11.0", uniquer.GetUniqueName("foo.11.0"));
+  EXPECT_EQ("foo.11", uniquer.GetUniqueName("foo.11"));
+}
+
 TEST_F(NameUniquerTest, Sanitize) {
   NameUniquer uniquer("_");
 
@@ -73,7 +80,7 @@ TEST_F(NameUniquerTest, Sanitize) {
   EXPECT_EQ("foo_55", uniquer.GetUniqueName("foo"));
 
   // Invalid characters will be replaced with '_'.
-  EXPECT_EQ("bar", uniquer.GetUniqueName("bar<-1000"));
+  EXPECT_EQ("bar_0", uniquer.GetUniqueName("bar<-1000"));
   EXPECT_EQ("bar_1", uniquer.GetUniqueName("bar<-2000"));
   EXPECT_EQ("bar_2", uniquer.GetUniqueName("bar_1"));
 
-- 
GitLab


From 052c3863cf8b901303a1a32e82b6525dc6ea6dbd Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 19 Apr 2018 18:45:47 -0700
Subject: [PATCH 200/450] Internal change.

PiperOrigin-RevId: 193608140
---
 tensorflow/compiler/xla/python/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index 0b9333b406..ecb87bd889 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -8,7 +8,6 @@ py_library(
     name = "xla_client",
     srcs = ["xla_client.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_oss"],
     visibility = ["//visibility:public"],
     deps = [
         ":pywrap_xla",
@@ -21,6 +20,7 @@ py_test(
     srcs = ["xla_client_test.py"],
     main = "xla_client_test.py",
     srcs_version = "PY2AND3",
+    tags = ["no_oss"],
     deps = [
         ":xla_client",
         "//tensorflow/python:platform_test",
-- 
GitLab


From 6e2df5e471295cd32f9887d76e6ddbf1b4e2a11a Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 19 Apr 2018 19:03:03 -0700
Subject: [PATCH 201/450] Automated g4 rollback of changelist 193593761

PiperOrigin-RevId: 193609407
---
 tensorflow/compiler/xla/service/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index d5d09bd8a3..9009cbf845 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -699,7 +699,6 @@ cc_library(
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/compiler/xla/service/cpu:cpu_transfer_manager",
         "//tensorflow/core:stream_executor_no_cuda",
-        "//tensorflow/stream_executor:stream_executor_impl",
     ],
 )
 
-- 
GitLab


From b001827146ff95c9e0ce5668c85d8cc2daf6b78d Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Thu, 19 Apr 2018 19:11:37 -0700
Subject: [PATCH 202/450] Support variable parameter structure in TPU
 distribution strategy.

TPUStrategy is added to a few more tests.

There appears to be an issue with the batch norm test in minimize_loss_test where the moving averages stay at 0.  I'm trying to resolve that separately as the next CL.

PiperOrigin-RevId: 193610264
---
 tensorflow/contrib/distribute/python/BUILD    | 18 +++--
 .../distribute/python/minimize_loss_test.py   | 19 ++++-
 .../distribute/python/single_loss_example.py  |  7 +-
 .../contrib/distribute/python/tpu_strategy.py | 70 +++++++++++--------
 .../contrib/distribute/python/values.py       | 34 +++++++--
 5 files changed, 104 insertions(+), 44 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 837a1f1348..c2834d8226 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -231,15 +231,14 @@ py_library(
     srcs = ["tpu_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/distribute/python:one_device_strategy",
-        "//tensorflow/contrib/eager/python:datasets",
-        "//tensorflow/contrib/optimizer_v2:training",
+        ":one_device_strategy",
+        ":values",
         "//tensorflow/contrib/tpu",
-        "//tensorflow/python:array_ops",
+        "//tensorflow/contrib/tpu:tpu_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python/eager:context",
-        "@six_archive//:six",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -249,9 +248,13 @@ py_library(
     srcs = ["minimize_loss_test.py"],
     deps = [
         ":combinations",
+        ":mirrored_strategy",
         ":single_loss_example",
+        "//tensorflow/contrib/tpu:tpu_lib",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:context",
@@ -324,6 +327,7 @@ py_library(
     srcs = ["single_loss_example.py"],
     deps = [
         ":step_fn",
+        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:layers",
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index 43b2e91cbf..e134fe34e1 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -96,8 +96,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       combinations.times(
           combinations.distributions_and_v1_optimizers() +
           combinations.distributions_and_v2_optimizers(),
-          combinations.combine(mode=["graph", "eager"])))
-  def testOptimizerInsideModelFn(self, distribution, optimizer_fn):
+          combinations.combine(mode=["graph", "eager"], is_tpu=[False])) +
+      combinations.combine(
+          distribution=[combinations.tpu_strategy],
+          optimizer_fn=[
+              combinations.adam_optimizer_v1_fn,
+              combinations.gradient_descent_optimizer_v1_fn
+          ],
+          mode=["graph"],
+          is_tpu=[True]))
+
+  def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu):
     created_variables = []
     trainable_variables = []
 
@@ -128,11 +137,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       if not context.executing_eagerly():
         with self.test_session() as sess:
+          if is_tpu:
+            sess.run(tpu.initialize_system())
           run_step = sess.make_callable(run_step())
         self.evaluate(variables_lib.global_variables_initializer())
 
       run_step()
 
+      if is_tpu:
+        with self.test_session() as sess:
+          sess.run(tpu.shutdown_system())
+
       def get_expected_variables(optimizer_fn, num_parameter_devices):
         variables_map = {
             "GradientDescent": ["dense/kernel", "dense/bias"],
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
index abd13c6cc6..0db0b59fca 100644
--- a/tensorflow/contrib/distribute/python/single_loss_example.py
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.distribute.python import step_fn
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -54,7 +55,11 @@ def minimize_loss_example(optimizer_fn,
   """Example of non-distribution-aware legacy code."""
 
   def dataset_fn():
-    return dataset_ops.Dataset.from_tensors([[1.]]).repeat().batch(2)
+    dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+    # TODO(isaprykin): map_and_batch with drop_remainder causes shapes to be
+    # fully defined for TPU.  Remove this when XLA supports dynamic shapes.
+    return dataset.apply(
+        batching.map_and_batch(lambda x: x, batch_size=2, drop_remainder=True))
 
   # An Optimizer instance is created either outside or inside model_fn.
   outer_optimizer = None
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index ceb52ceca7..a7e4fe80f3 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -21,15 +21,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
+
 from tensorflow.contrib import tpu
 from tensorflow.contrib.distribute.python import one_device_strategy
 from tensorflow.contrib.distribute.python import values
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util import nest
 
 
 # TODO(isaprykin):  Consider whether inheriting is really appropriate.
@@ -37,48 +38,53 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
   """Experimental TPU distribution strategy implementation."""
 
   def __init__(self,
-               global_batch_size=2,
                num_cores_per_host=2,
                iterations_per_step=2):
-    # TODO(isaprykin): Generalize the defaults.
+    # TODO(isaprykin): Generalize the defaults.  They are currently tailored for
+    # the unit test.
     super(TPUStrategy, self).__init__('/cpu:0')
     # TODO(isaprykin): Auto-detect number of cores and hosts.
     self._num_cores_per_host = num_cores_per_host
-    self._global_batch_size = global_batch_size
     # TODO(isaprykin): This might have to be per-call.
     self._iterations_per_step = iterations_per_step
 
   def distribute_dataset(self, dataset_fn):
     return values.PerIterationDataset(
-        self._call_dataset_fn(dataset_fn), self._iterations_per_step)
+        self._call_dataset_fn(dataset_fn), self._iterations_per_step,
+        self._num_cores_per_host)
 
   def _call_for_each_tower(self, fn, *args, **kwargs):
     kwargs.pop('run_concurrently', None)
 
-    # TODO(isaprykin): Support variable arguments similar to PerDevice+regroup.
-    inputs = args[0]
+    inputs = {'args': args, 'kwargs': kwargs}
+    flat_inputs = nest.flatten(inputs)
+
+    feed_mask = [isinstance(f, values.PerIteration) for f in flat_inputs]
 
-    sharded_shape = [None]  # Python 2 nonlocal.
+    feeds = lambda: itertools.compress(flat_inputs, feed_mask)
+    shapes = [f.get_shape() for f in feeds()]
+    if any([not s.is_fully_defined() for s in shapes]):
+      raise ValueError(
+          'TPU currently requires fully defined shapes. Either use '
+          'set_shape() on the input tensors or use '
+          'dataset.apply(map_and_batch(..., drop_remainder=True)).')
+    types = [f.get_dtype() for f in feeds()]
 
     def infeed_input(i):
       """Get input, split it and then enqueue."""
-      batches = array_ops.gather(inputs, i)
+      iteration_inputs = [f.get(i) for f in feeds()]
 
-      # TODO(isaprykin):  Handle partial batch.
-      global_shape = [self._global_batch_size] + list(batches.get_shape())[1:]
-      sharded_shape[0] = ([self._global_batch_size / self._num_cores_per_host] +
-                          list(global_shape)[1:])
+      infeed_inputs = [[inputs_per_core[core_id]
+                        for inputs_per_core in iteration_inputs]
+                       for core_id in range(self._num_cores_per_host)]
 
-      batches.set_shape(global_shape)
-      batches = array_ops.split(batches, self._num_cores_per_host)
+      infeed_ops = []
+      for core_id, infeed_input in enumerate(infeed_inputs):
+        infeed_ops.append(
+            tpu_ops.infeed_enqueue_tuple(
+                inputs=infeed_input, shapes=shapes, device_ordinal=core_id))
 
-      infeeds = [
-          tpu_ops.infeed_enqueue_tuple(
-              inputs=[batches[j]], shapes=[sharded_shape[0]], device_ordinal=j)
-          for j in range(self._num_cores_per_host)
-      ]
-
-      with ops.control_dependencies(infeeds):
+      with ops.control_dependencies(infeed_ops):
         return i + 1
 
     with ops.device('/task:0/device:CPU:0'):
@@ -87,13 +93,21 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
           infeed_input, [constant_op.constant(0)],
           parallel_iterations=1)
 
-    assert sharded_shape[0]
-
     def dequeueing_fn(*args, **kwargs):
+      """Dequeue input arguments and supply them to `fn`."""
       del args, kwargs
-      x, = tpu.infeed_dequeue_tuple(
-          dtypes=[dtypes.float32], shapes=[sharded_shape[0]])
-      return fn(x)
+      dequeued = tpu.infeed_dequeue_tuple(dtypes=types, shapes=shapes)
+      dequeued = iter(dequeued)
+
+      fn_inputs = []
+      for inp, is_feed in zip(flat_inputs, feed_mask):
+        if is_feed:
+          fn_inputs.append(next(dequeued))
+        else:
+          fn_inputs.append(inp)
+
+      fn_inputs = nest.pack_sequence_as(inputs, fn_inputs)
+      return fn(*fn_inputs['args'], **fn_inputs['kwargs'])
 
     def iterate_on_tpu():
       return tpu.repeat(self._iterations_per_step, dequeueing_fn, [])
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 62016c3a78..8cb5276579 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -570,18 +570,36 @@ class PerDeviceDataset(object):
         dataset_iterator, self._devices, self._prefetch_on_device)
 
 
+class PerIteration(object):
+  """Holds input for multiple iterations at once."""
+
+  def __init__(self, index):
+    self._index = index
+
+  def get(self, iteration):
+    return array_ops.gather(self._index, iteration)
+
+  def get_shape(self):
+    return self._index[-1][-1].get_shape()
+
+  def get_dtype(self):
+    return self._index[-1][-1].dtype
+
+
 class MultiIterator(object):
   """Iterator that returns results of multiple get_next()s."""
 
-  def __init__(self, dataset_iterator, iterations):
+  def __init__(self, dataset_iterator, iterations, batches_per_iteration):
     self._dataset_iterator = dataset_iterator
     self._iterations = iterations
+    self._batches_per_iteration = batches_per_iteration
 
   def get_next(self, name=None):
-    return [
+    return PerIteration([[
         self._dataset_iterator.get_next(name=name)
-        for _ in range(self._iterations)
+        for _ in range(self._batches_per_iteration)
     ]
+                         for _ in range(self._iterations)])
 
   @property
   def initializer(self):
@@ -589,18 +607,22 @@ class MultiIterator(object):
 
 
 class PerIterationDataset(object):
+  """A dataset that returns MultiIterators."""
 
-  def __init__(self, dataset, iterations):
+  def __init__(self, dataset, iterations, batches_per_iteration):
     self._dataset = dataset
     self._iterations = iterations
+    self._batches_per_iteration = batches_per_iteration
 
   def make_one_shot_iterator(self):
     iterator = self._dataset.make_one_shot_iterator()
-    return MultiIterator(iterator, self._iterations)
+    return MultiIterator(iterator, self._iterations,
+                         self._batches_per_iteration)
 
   def make_initializable_iterator(self):
     iterator = self._dataset.make_initializable_iterator()
-    return MultiIterator(iterator, self._iterations)
+    return MultiIterator(iterator, self._iterations,
+                         self._batches_per_iteration)
 
 
 class MapOutput(object):
-- 
GitLab


From 8723770b4cbcac0a528354d8508a5ef83716d1fa Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 19 Apr 2018 19:27:34 -0700
Subject: [PATCH 203/450] [XLA] Remove default argument on virtual function
 DeviceMemoryAllocator::Allocate().

Default args on virtual functions are disallowed by the Google style
guide, for good reason.  They have the extremely surprising behavior
that the defaults you get when calling a function through a pointer
depend not on the dynamic type of the underlying object, but on the
static (declared) type of the pointer!

PiperOrigin-RevId: 193611213
---
 .../xla/service/device_memory_allocator.h     | 30 ++++++++++++++-----
 .../xla/tests/local_client_test_base.cc       |  3 +-
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h
index 240acf8973..da45c4d45a 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.h
@@ -38,13 +38,25 @@ class DeviceMemoryAllocator {
   virtual ~DeviceMemoryAllocator() {}
 
   // 'retry_on_failure': If false, and the first attempt to allocate the memory
-  // fails, the allocation should return immediately without retrying.
-  // An example use case is optional scratch spaces where a failure
-  // has only performance impact.
+  // fails, the allocation should return immediately without retrying.  An
+  // example use case is optional scratch spaces where a failure has only
+  // performance impact.
+  //
   // Allocate() should return a null pointer for a size-0 allocation.
   // Deallocate() must be a no-op for null pointers.
-  virtual StatusOr<se::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure = true) = 0;
+  virtual StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal,
+                                                  uint64 size,
+                                                  bool retry_on_failure) = 0;
+
+  // Two-arg version of Allocate(), which sets retry-on-failure to true.
+  //
+  // (We don't simply use a default argument on the virtual Allocate function
+  // because default args on virtual functions are disallowed by the Google
+  // style guide.)
+  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size) {
+    return Allocate(device_ordinal, size, /*retry_on_failure=*/true);
+  }
+
   virtual tensorflow::Status Deallocate(int device_ordinal,
                                         se::DeviceMemoryBase* mem) = 0;
 
@@ -67,8 +79,12 @@ class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
       const se::Platform* platform,
       tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
 
-  StatusOr<se::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure = true) override;
+  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
+                                          bool retry_on_failure) override;
+
+  // Pull in two-arg overload that sets retry_on_failure to true.
+  using DeviceMemoryAllocator::Allocate;
+
   tensorflow::Status Deallocate(int device_ordinal,
                                 se::DeviceMemoryBase* mem) override;
 
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index c60ba2422f..bb5aabb214 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -44,7 +44,8 @@ StatusOr<se::DeviceMemoryBase> TestAllocator::Allocate(int device_ordinal,
     allocation_count_++;
     device_allocation_count_[device_ordinal]++;
   }
-  return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size);
+  return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size,
+                                                 retry_on_failure);
 }
 
 tensorflow::Status TestAllocator::Deallocate(int device_ordinal,
-- 
GitLab


From 2a956c9b8f9950405b481ccc0e05636873ecc9ae Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 20 Apr 2018 02:40:37 +0000
Subject: [PATCH 204/450] Support string tensors for tf.count_nonzero

This fix tries to address the issue raised in 18712 where
`tf.count_nonzero` does not support string tensors.

The implementation of `tf.count_nonzero` relies on `tf.not_equal`,
which actually supports string tensors. The reason string tensors
do not work is that `tf.count_nonzero` creates a numpy-typed
`zero` using `input_tensor.dtype.as_numpy_dtype()`. That numpy `zero`
is then passed to `tf.not_equal` (which converts the numpy `zero`
into a tensor zero). However, `input_tensor.dtype.as_numpy_dtype()`
converts tf.string to numpy.object, hence the exception.

But that is not necessary as `zero` could be created
with `tf.zeros` directly without back and forth conversion
to numpy.

This fix fixes the issue.

This fix fixes 18712.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 781b1c557f..8c9ad66b0e 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1487,7 +1487,8 @@ def count_nonzero(input_tensor,
 
   with ops.name_scope(name, "count_nonzero", [input_tensor]):
     input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
-    zero = input_tensor.dtype.as_numpy_dtype()
+    # A scalar of 'zero' is enough as `not_equal` will broadcast.
+    zero = array_ops.zeros([], dtype=input_tensor.dtype)
     return cast(
         reduce_sum(
             # int64 reduction happens on GPU
-- 
GitLab


From 37999ce500f27d587100f0bf45e87957936f5ada Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 20 Apr 2018 02:48:15 +0000
Subject: [PATCH 205/450] Add test case for tf.string support with
 tf.count_nonzero

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/reduction_ops_test.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 589ea54973..0be89e1ff4 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -958,6 +958,12 @@ class CountNonzeroReductionTest(test.TestCase):
           y = math_ops.count_nonzero(x, [0])
           self.assertAllEqual(y.eval(), np.zeros(9938))
 
+  def testStringReduce(self):
+    # Test case for GitHub issue 18712
+    with self.test_session() as sess:
+      v = math_ops.count_nonzero(constant_op.constant(["test"]))
+      self.assertAllClose(sess.run(v), 1)
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 7358025743951b42fe0f99fb85b4418769de5357 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 20 Apr 2018 02:51:54 +0000
Subject: [PATCH 206/450] Add test cases with axis and keepdims for
 tf.count_nonzero and string

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/kernel_tests/reduction_ops_test.py       | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 0be89e1ff4..943b80b787 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -889,9 +889,9 @@ class AnyReductionTest(test.TestCase):
 
 class CountNonzeroReductionTest(test.TestCase):
 
-  def _compare(self, x, reduction_axes, keepdims, use_gpu=False,
+  def _compare(self, x, reduction_axes, keepdims, use_gpu=False, zero=0,
                feed_dict=None):
-    np_ans = (x != 0).astype(np.int32)
+    np_ans = (x != zero).astype(np.int32)
     if reduction_axes is None:
       np_ans = np.sum(np_ans, keepdims=keepdims)
     else:
@@ -964,6 +964,15 @@ class CountNonzeroReductionTest(test.TestCase):
       v = math_ops.count_nonzero(constant_op.constant(["test"]))
       self.assertAllClose(sess.run(v), 1)
 
+  def testStringReduce1D(self):
+    # Create a 1D array of strings
+    x = np.asarray(["", "", "a", "", "", "b"])
+    self._compare(x, None, keepdims=False, zero=np.str(""))
+    self._compare(x, [], keepdims=False, zero=np.str(""))
+    self._compare(x, [0], keepdims=False, zero=np.str(""))
+    self._compare(x, None, keepdims=True, zero=np.str(""))
+    self._compare(x, [], keepdims=True, zero=np.str(""))
+    self._compare(x, [0], keepdims=True, zero=np.str(""))
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 01ab85f0fdce13f98b705c54901284a165ed7bd8 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 20 Apr 2018 02:53:57 +0000
Subject: [PATCH 207/450] Add n-D test cases for better coverage

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/kernel_tests/reduction_ops_test.py    | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 943b80b787..ea78b58d88 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -974,5 +974,21 @@ class CountNonzeroReductionTest(test.TestCase):
     self._compare(x, [], keepdims=True, zero=np.str(""))
     self._compare(x, [0], keepdims=True, zero=np.str(""))
 
+  def testStringReduce2D(self):
+    # Create a 2D array of strings
+    x = np.asarray([["", "", "a", "", "", "b"],
+                    ["", "c", "", "d", "", ""],
+                    ["e", "", "f", "", "", ""]])
+    self._compare(x, None, keepdims=False, zero=np.str(""))
+    self._compare(x, [], keepdims=False, zero=np.str(""))
+    self._compare(x, [0], keepdims=False, zero=np.str(""))
+    self._compare(x, [1], keepdims=False, zero=np.str(""))
+    self._compare(x, [0, 1], keepdims=False, zero=np.str(""))
+    self._compare(x, None, keepdims=True, zero=np.str(""))
+    self._compare(x, [], keepdims=True, zero=np.str(""))
+    self._compare(x, [0], keepdims=True, zero=np.str(""))
+    self._compare(x, [0, 1], keepdims=True, zero=np.str(""))
+
+
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 38dcc57681612c2321169367c8756bb218472dd7 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 19 Apr 2018 19:56:09 -0700
Subject: [PATCH 208/450] Revert part of
 tensorflow/core/grappler/optimizers/meta_optimizer.cc from #18479.

---
 .../grappler/optimizers/meta_optimizer.cc     | 22 +------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index bca779c3b3..22799311bc 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -168,26 +168,6 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
   TF_RETURN_IF_ERROR(register_by_name ? InitializeOptimizersByName(&optimizers)
                                       : InitializeOptimizers(&optimizers));
 
-  // Append custom configurable optimizers.
-  std::vector<tensorflow::RewriterConfig_CustomGraphOptimizer>
-      custom_configurable_optimizers;
-  for (const auto& optimizer : cfg_.custom_optimizers()) {
-    if (available_optimizers.find(optimizer.name()) !=
-        available_optimizers.end()) {
-      optimizers.push_back(NewOptimizer(optimizer.name()));
-    } else {
-      custom_configurable_optimizers.push_back(optimizer);
-    }
-  }
-  // Now initialize and configure the custom optimizers.
-  for (const auto& optimizer : custom_configurable_optimizers) {
-    std::unique_ptr<CustomGraphOptimizer> opt =
-        CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer.name());
-    if (opt == nullptr) continue;
-    TF_RETURN_IF_ERROR(opt->Init(&optimizer));
-    optimizers.push_back(std::move(opt));
-  }
-
   if (optimizers.empty()) {
     *optimized_graph = item.graph;
     return Status::OK();
@@ -341,7 +321,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
          cfg.auto_parallel().enable() ||
          cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
          cfg.debug_stripper() == RewriterConfig::ON ||
-         !cfg.optimizers().empty() || !cfg.custom_optimizers().empty();
+         !cfg.optimizers().empty();
 }
 
 Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
-- 
GitLab


From 4ef9de422d452683ac661d3a6313aeb2972b836d Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 19 Apr 2018 20:00:21 -0700
Subject: [PATCH 209/450] Always include the local worker in the list of
 filtered targets.

It is currently legal to specify a device filter that doesn't include the local worker.
In that case, the MasterSession includes all local devices regardless of the filter.
This change extends this behavior to the list of filtered workers, which will be crucial for backwards compatibility when we enable CreateWorkerSession for all MasterSessions, because we need to call CreateWorkerSession on all potential workers.

PiperOrigin-RevId: 193613313
---
 tensorflow/core/distributed_runtime/master.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 288656e7f8..e60386fd34 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -167,13 +167,16 @@ class DeviceFinder {
     }
     // Enumerates all known workers' target. A target name is a
     // prefix of a device name. E.g., /job:mnist/replica:0/task:10.
+    CHECK_GT(env_->local_devices.size(), 0) << "No local devices provided.";
+    const string& local_device_name = env_->local_devices[0]->name();
     std::vector<string> workers;
     worker_cache->ListWorkers(&workers);
     if (filters_.empty()) {
       std::swap(workers, targets_);
     } else {
       for (const string& name : workers) {
-        if (MatchFilters(name)) {
+        if (MatchFilters(name) ||
+            DeviceNameUtils::IsSameAddressSpace(name, local_device_name)) {
           targets_.push_back(name);
         }
       }
-- 
GitLab


From ddd763de08c5095d9a0dbb8acceb82135c0aa485 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Fri, 20 Apr 2018 11:08:34 +0800
Subject: [PATCH 210/450] Fix typo that caused protobuf load failure

---
 tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
index 743247bb60..ad0aeac004 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -80,4 +80,5 @@ $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
 $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
 $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+END
 }
-- 
GitLab


From 7f3baa210a45cd0b41e21b63c2be6dd54230ea0b Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 20 Apr 2018 02:55:31 +0000
Subject: [PATCH 211/450] Update doc string for tf.count_nonzero to add string
 type

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 8c9ad66b0e..31ce83905b 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1467,7 +1467,8 @@ def count_nonzero(input_tensor,
   ```
 
   Args:
-    input_tensor: The tensor to reduce. Should be of numeric type, or `bool`.
+    input_tensor: The tensor to reduce. Should be of numeric type, `string`,
+      or `bool`.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-- 
GitLab


From 2273c4e56334caf31de01c6b6f8f4edd48432972 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 19 Apr 2018 21:33:41 -0700
Subject: [PATCH 212/450] Skip tests with no_oss tag in XLA builds.

PiperOrigin-RevId: 193619344
---
 tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
index a94a627dfb..a410c10b61 100755
--- a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
+++ b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
@@ -35,7 +35,7 @@ echo "build --distinct_host_configuration=false" >> .tf_configure.bazelrc
 
 bazel clean
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \
+bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test,-no_oss -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
     --build_tests_only --test_output=errors --local_test_jobs=8 \
     --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
-- 
GitLab


From 06bb3364795e443206910c98cee132d719cf41e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Fri, 20 Apr 2018 13:33:05 +0800
Subject: [PATCH 213/450] TST: byte string for python3

---
 .../python/kernel_tests/scatter_nd_ops_test.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index dfe9600dbb..b7477a768a 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -365,31 +365,35 @@ class ScatterNdTest(test.TestCase):
     return array_ops.scatter_nd(indices, updates, shape)
 
   def testString(self):
-    indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+    indices = constant_op.constant([[4], [3], [1], [7]],
+                                   dtype=dtypes.int32)
     updates = constant_op.constant(["four", "three", "one", "seven"],
                                    dtype=dtypes.string)
-    expected = np.array(["", "one", "", "three", "four", "", "", "seven"])
+    expected = np.array([b"", b"one", b"", b"three", b"four",
+                         b"", b"", b"seven"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.test_session() as sess:
       result = sess.run(scatter)
       self.assertAllEqual(expected, result)
 
     # Same indice is updated twice by same value.
-    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    indices = constant_op.constant([[4], [3], [3], [7]],
+                                   dtype=dtypes.int32)
     updates = constant_op.constant(["a", "b", "b", "c"],
                                    dtype=dtypes.string)
-    expected = np.array(["", "", "", "bb", "a", "", "", "c"])
+    expected = np.array([b"", b"", b"", b"bb", b"a", b"", b"", b"c"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.test_session() as sess:
       result = sess.run(scatter)
       self.assertAllEqual(expected, result)
 
     # Same indice is updated twice by different value.
-    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    indices = constant_op.constant([[4], [3], [3], [7]],
+                                   dtype=dtypes.int32)
     updates = constant_op.constant(["a", "b", "c", "d"],
                                    dtype=dtypes.string)
-    expected = [np.array(["", "", "", "bc", "a", "", "", "d"]),
-                np.array(["", "", "", "cb", "a", "", "", "d"])]
+    expected = [np.array([b"", b"", b"", b"bc", b"a", b"", b"", b"d"]),
+                np.array([b"", b"", b"", b"cb", b"a", b"", b"", b"d"])]
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.test_session() as sess:
       result = sess.run(scatter)
-- 
GitLab


From 70b8d21edcc84818835c9e2940a5df288c309d45 Mon Sep 17 00:00:00 2001
From: Roy Frostig <frostig@google.com>
Date: Thu, 19 Apr 2018 23:01:07 -0700
Subject: [PATCH 214/450] [XLA] Rework the local XLA client's Shape class with
 separate array and tuple shape constructors.

PiperOrigin-RevId: 193624591
---
 .../compiler/xla/python/numpy_bridge.cc       |  20 +--
 tensorflow/compiler/xla/python/xla_client.py  | 137 ++++++++++++------
 .../compiler/xla/python/xla_client_test.py    |  10 +-
 3 files changed, 103 insertions(+), 64 deletions(-)

diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index eec48479c9..dc6f5fe5fc 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -181,16 +181,6 @@ StatusOr<Shape> XlaShapeFromPyShape(PyObject* o) {
                            PyObjectCppRepr(o).c_str());
   };
 
-  auto get_attr = [o, &error](const string& field) -> StatusOr<PyObject*> {
-    PyObject* result =
-        PyObject_GetAttrString(o, const_cast<char*>(field.c_str()));
-    if (result == nullptr) {
-      return error(tensorflow::strings::StrCat(
-          "Failed to get attribute of Shape object:", field));
-    }
-    return result;
-  };
-
   auto call_method = [o, &error](const string& method) -> StatusOr<PyObject*> {
     PyObject* result =
         PyObject_CallMethod(o, const_cast<char*>(method.c_str()), nullptr);
@@ -202,12 +192,16 @@ StatusOr<Shape> XlaShapeFromPyShape(PyObject* o) {
   };
 
   PyObject* np_type;
-  TF_ASSIGN_OR_RETURN(np_type, get_attr("np_dtype"));
+  TF_ASSIGN_OR_RETURN(np_type, call_method("numpy_dtype"));
   if (np_type->ob_type != &PyArrayDescr_Type) {
-    return error("Shape attribute np_dtype is not an integer numpy dtype");
+    return error(
+        "Return value of shape method numpy_dtype "
+        "is not an integer numpy dtype");
   }
   if (!NumpyTypeIsValid(NumpyTypenum(np_type))) {
-    return error("Shape attribute np_dtype is not a valid integer numpy dtype");
+    return error(
+        "Return value of shape method numpy_dtype "
+        "is not a valid integer numpy dtype");
   }
   const PrimitiveType element_type =
       NumpyTypeToPrimitiveType(NumpyTypenum(np_type));
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 9c81f6439d..f6809b6b87 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -166,14 +166,14 @@ class LocalBuffer(object):
     self._delete = c_api.DeleteLocalShapedBuffer
 
   @staticmethod
-  def from_py(npval, layout_fn=None):
-    npval = require_numpy_array_layout(npval)
+  def from_pyval(pyval, layout_fn=None):
+    pyval = require_numpy_array_layout(pyval)
     if layout_fn:
-      shape = Shape.from_numpy(npval)
+      shape = Shape.from_pyval(pyval)
       shape = shape.map_leaves(layout_fn)
     else:
       shape = None
-    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval, shape))
+    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(pyval, shape))
 
   def to_py(self):
     return self.c_local_shaped_buffer.ToLiteral()
@@ -191,53 +191,104 @@ class LocalBuffer(object):
 
 
 class Shape(object):
-  """XLA shape.
+  """Represents an XLA shape.
 
-  Represents an XLA shape by a corresponding Python/Numpy type and a
-  list of dimensions, which are themselves Shapes in case this one
-  represents an XLA tuple.
+  A shape is either an array shape, having rank-many integer
+  dimensions and an element type (represented by a Numpy dtype), or it
+  is a tuple shape, having a shape for every tuple component:
+
+    type shape =
+        TupleShape of shape list
+      | ArrayShape of { dimensions: int list; element_type: dtype }
+
+  Callers are expected to instantiate this class only via the static
+  constructors: tuple_shape, array_shape, and from_pyval.
   """
 
-  def __init__(self, np_dtype, dimensions, minor_to_major=None):
+  @staticmethod
+  def tuple_shape(tuple_shapes):
+    """Construct a tuple shape from a tuple or list of Shape objects."""
+    if (not isinstance(tuple_shapes, (tuple, list)) or
+        not all(isinstance(t, Shape) for t in tuple_shapes)):
+      raise TypeError('tuple_shapes must be a tuple of Shapes')
+    return Shape(tuple(tuple_shapes), tuple)  # __init__ asserts a tuple
+
+  @staticmethod
+  def array_shape(element_type, dimensions, minor_to_major=None):
+    """Construct an array shape."""
+    if (not isinstance(dimensions, tuple) or
+        not all(isinstance(i, int) for i in dimensions)):
+      dimensions = tuple(int(i) for i in dimensions)
+    return Shape(dimensions, np.dtype(element_type),
+                 minor_to_major=minor_to_major)
+
+  @staticmethod
+  def from_pyval(pyval):
+    def convert(pyval):
+      if isinstance(pyval, tuple):
+        return Shape.tuple_shape(tuple(convert(elt) for elt in pyval))
+      else:
+        pyval = require_numpy_array_layout(pyval)
+        return Shape.array_shape(pyval.dtype, np.shape(pyval))
+    return convert(pyval)
+
+  def __init__(self, dimensions, dtype, minor_to_major=None):
     assert isinstance(dimensions, tuple)
-    self.np_dtype = np_dtype
     self._dimensions = dimensions
+    self._dtype = dtype
+    self._is_tuple = dtype == tuple
     self._minor_to_major = minor_to_major
     self._check_minor_to_major()
 
   def __eq__(self, other):
     # pylint: disable=protected-access
-    return (self.np_dtype == other.np_dtype and
+    return (self._dtype == other._dtype and
             self._dimensions == other._dimensions and
             self._minor_to_major == other._minor_to_major)
 
   def __repr__(self):
-    return ('xla_client.Shape(np_dtype={!r}, dimensions={!r}, '
-            'minor_to_major={!r})').format(self.np_dtype, self._dimensions,
-                                           self._minor_to_major)
-
-  def element_type(self):
-    return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)]
+    return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, '
+            '_is_tuple={!r}, _minor_to_major={!r})').format(
+                self._dtype, self._dimensions, self._is_tuple,
+                self._minor_to_major)
 
   def is_tuple(self):
-    return self.element_type() == xla_data_pb2.TUPLE
+    return self._is_tuple
 
-  def dimensions(self):
-    if self.is_tuple():
-      raise ValueError('Tuple shape has no dimensions')
-    return self._dimensions
-
-  def minor_to_major(self):
-    return self._minor_to_major
+  def is_array(self):
+    return not self._is_tuple
 
   def tuple_shapes(self):
     if not self.is_tuple():
-      raise ValueError('Shape is not a tuple shape')
+      raise ValueError('not a tuple shape')
+    return self._dimensions
+
+  def numpy_dtype(self):
+    """Like element_type(), but returns dtype('O') in case of a tuple shape."""
+    if self.is_tuple():
+      return np.dtype('O')
+    else:
+      return self.element_type()
+
+  def xla_element_type(self):
+    return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.numpy_dtype())]
+
+  def element_type(self):
+    if not self.is_array():
+      raise ValueError('not an array shape')
+    return self._dtype
+
+  def dimensions(self):
+    if not self.is_array():
+      raise ValueError('not an array shape')
     return self._dimensions
 
   def rank(self):
     return len(self.dimensions())
 
+  def minor_to_major(self):
+    return self._minor_to_major
+
   def map_leaves(self, f):
     """Map f over each leaf-level array subshape.
 
@@ -250,7 +301,7 @@ class Shape(object):
     """
     if self.is_tuple():
       children = tuple(child.map_leaves(f) for child in self.tuple_shapes())
-      return Shape(np.dtype('O'), children)
+      return Shape.tuple_shape(children)
     else:
       mapped = f(self)
       return self if mapped is None else mapped
@@ -264,30 +315,24 @@ class Shape(object):
       assert sorted(mtm) == range(len(mtm)), self
 
   def update_minor_to_major(self, minor_to_major):
+    if not self.is_array():
+      raise ValueError('not an array shape')
     if not isinstance(minor_to_major, tuple):
       raise TypeError('minor_to_major must be a tuple')
-    updated = Shape(self.np_dtype, tuple(self.dimensions()), minor_to_major)
+    updated = Shape.array_shape(
+        self.element_type(), self.dimensions(), minor_to_major)
     updated._check_minor_to_major()  # pylint: disable=protected-access
     return updated
 
-  @staticmethod
-  def from_numpy(npval):
-
-    def convert(npval):
-      if isinstance(npval, tuple):
-        return Shape(np.dtype('O'), tuple(convert(elt) for elt in npval))
-      else:
-        return Shape(npval.dtype, np.shape(npval))
-
-    return convert(require_numpy_array_layout(npval))
-
 
 def _wrap_shape(shape_info):
   dtype, dims = shape_info
   element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)]
   if element_type == xla_data_pb2.TUPLE:
-    dims = tuple(_wrap_shape(subshape_info) for subshape_info in dims)
-  return Shape(dtype, dims)
+    shapes = tuple(_wrap_shape(subshape_info) for subshape_info in dims)
+    return Shape.tuple_shape(shapes)
+  else:
+    return Shape.array_shape(dtype, dims)
 
 
 def _wrap_data_handle(handle):
@@ -420,7 +465,7 @@ class LocalComputation(object):
                                   compile_options=None,
                                   layout_fn=None):
     return self.Compile(
-        argument_shapes=[Shape.from_numpy(arg) for arg in arguments],
+        argument_shapes=[Shape.from_pyval(arg) for arg in arguments],
         compile_options=compile_options,
         layout_fn=layout_fn)
 
@@ -428,7 +473,7 @@ class LocalComputation(object):
     """Execute with Python values as arguments and return value."""
     if not self.is_compiled:
       raise ValueError('Cannot execute an uncompiled local XLA computation.')
-    argument_shapes = [Shape.from_numpy(arg) for arg in arguments]
+    argument_shapes = [Shape.from_pyval(arg) for arg in arguments]
     if layout_fn:
       argument_shapes = [
           shape.map_leaves(layout_fn) for shape in argument_shapes
@@ -607,7 +652,7 @@ class ComputationBuilder(object):
       A ComputationDataHandle message.
     """
     return self.ParameterWithShape(
-        Shape.from_numpy(value), name=name, parameter_num=parameter_num)
+        Shape.from_pyval(value), name=name, parameter_num=parameter_num)
 
   def Broadcast(self, operand, sizes):
     """Enqueues a broadcast operation onto the computation.
@@ -968,7 +1013,7 @@ class ComputationBuilder(object):
 
     Returns: a ComputationDataHandle to the generated array of F32 values.
     """
-    shape = Shape(self.GetShape(mu).np_dtype, dims)
+    shape = Shape.array_shape(self.GetShape(mu).element_type(), dims)
     return _wrap_data_handle(
         self._client.RngNormal(
             _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape))
@@ -988,7 +1033,7 @@ class ComputationBuilder(object):
     Returns: a ComputationDataHandle to the generated array of values with the
       same numeric type (F32, S32, or U32) as the arguments a and b.
     """
-    shape = Shape(self.GetShape(a).np_dtype, dims)
+    shape = Shape.array_shape(self.GetShape(a).element_type(), dims)
     return _wrap_data_handle(
         self._client.RngUniform(
             _unwrap_data_handle(a), _unwrap_data_handle(b), shape))
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index d97264ea64..6fe7b242e4 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -319,7 +319,7 @@ class LocalBufferTest(LocalComputationTest):
 
   def _Execute(self, c, arguments):
     compiled_c = c.Build().CompileWithExampleArguments(arguments)
-    arg_buffers = [xla_client.LocalBuffer.from_py(arg) for arg in arguments]
+    arg_buffers = [xla_client.LocalBuffer.from_pyval(arg) for arg in arguments]
     result_buffer = compiled_c.ExecuteWithLocalBuffers(arg_buffers)
     return result_buffer.to_py()
 
@@ -350,7 +350,7 @@ class LocalBufferTest(LocalComputationTest):
     c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)), c.ConstantF32Scalar(3.14))
     arg = NumpyArrayF32(1.11)
     compiled_c = c.Build().CompileWithExampleArguments([arg])
-    arg_buffer = xla_client.LocalBuffer.from_py(arg)
+    arg_buffer = xla_client.LocalBuffer.from_pyval(arg)
     arg_buffer.delete()
     with self.assertRaises(ValueError):
       compiled_c.ExecuteWithLocalBuffers([arg_buffer])
@@ -1288,7 +1288,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
   def testInfeedS32Values(self):
     to_infeed = NumpyArrayS32([1, 2, 3, 4])
     c = self._NewComputation()
-    c.Infeed(xla_client.Shape.from_numpy(to_infeed[0]))
+    c.Infeed(xla_client.Shape.from_pyval(to_infeed[0]))
     compiled_c = c.Build().CompileWithExampleArguments()
     for item in to_infeed:
       xla_client.transfer_to_infeed(item)
@@ -1300,7 +1300,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
   def testInfeedThenOutfeedS32(self):
     to_round_trip = NumpyArrayS32([1, 2, 3, 4])
     c = self._NewComputation()
-    x = c.Infeed(xla_client.Shape.from_numpy(to_round_trip[0]))
+    x = c.Infeed(xla_client.Shape.from_pyval(to_round_trip[0]))
     c.Outfeed(x)
 
     compiled_c = c.Build().CompileWithExampleArguments()
@@ -1310,7 +1310,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
       execution.start()
       xla_client.transfer_to_infeed(want)
       got = xla_client.transfer_from_outfeed(
-          xla_client.Shape.from_numpy(to_round_trip[0]))
+          xla_client.Shape.from_pyval(to_round_trip[0]))
       execution.join()
       self.assertEqual(want, got)
 
-- 
GitLab


From f7e8fbb28a0fa4e979a94d7b458706abf48f7deb Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 19 Apr 2018 23:08:53 -0700
Subject: [PATCH 215/450] Automated g4 rollback of changelist 193602050

PiperOrigin-RevId: 193625346
---
 tensorflow/core/lib/io/record_reader.cc    | 147 ++++----------
 tensorflow/core/lib/io/record_reader.h     |  16 +-
 tensorflow/core/lib/io/recordio_test.cc    | 212 ++++++++++++++-------
 tensorflow/core/lib/io/zlib_inputstream.cc |  16 +-
 tensorflow/core/lib/io/zlib_inputstream.h  |  19 +-
 5 files changed, 220 insertions(+), 190 deletions(-)

diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index 6de850bb20..c24628be57 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -56,110 +56,55 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions(
 
 RecordReader::RecordReader(RandomAccessFile* file,
                            const RecordReaderOptions& options)
-    : src_(file), options_(options) {
+    : options_(options),
+      input_stream_(new RandomAccessInputStream(file)),
+      last_read_failed_(false) {
   if (options.buffer_size > 0) {
-    input_stream_.reset(new BufferedInputStream(file, options.buffer_size));
-  } else {
-    input_stream_.reset(new RandomAccessInputStream(file));
+    input_stream_.reset(new BufferedInputStream(input_stream_.release(),
+                                                options.buffer_size, true));
   }
   if (options.compression_type == RecordReaderOptions::ZLIB_COMPRESSION) {
 // We don't have zlib available on all embedded platforms, so fail.
 #if defined(IS_SLIM_BUILD)
     LOG(FATAL) << "Zlib compression is unsupported on mobile platforms.";
 #else   // IS_SLIM_BUILD
-    zlib_input_stream_.reset(new ZlibInputStream(
-        input_stream_.get(), options.zlib_options.input_buffer_size,
-        options.zlib_options.output_buffer_size, options.zlib_options));
+    input_stream_.reset(new ZlibInputStream(
+        input_stream_.release(), options.zlib_options.input_buffer_size,
+        options.zlib_options.output_buffer_size, options.zlib_options, true));
 #endif  // IS_SLIM_BUILD
   } else if (options.compression_type == RecordReaderOptions::NONE) {
     // Nothing to do.
   } else {
-    LOG(FATAL) << "Unspecified compression type :" << options.compression_type;
+    LOG(FATAL) << "Unrecognized compression type :" << options.compression_type;
   }
 }
 
 // Read n+4 bytes from file, verify that checksum of first n bytes is
 // stored in the last 4 bytes and store the first n bytes in *result.
-// May use *storage as backing store.
-Status RecordReader::ReadChecksummed(uint64 offset, size_t n,
-                                     StringPiece* result, string* storage) {
+//
+// offset corresponds to the user-provided value to ReadRecord()
+// and is used only in error messages.
+Status RecordReader::ReadChecksummed(uint64 offset, size_t n, string* result) {
   if (n >= SIZE_MAX - sizeof(uint32)) {
     return errors::DataLoss("record size too large");
   }
 
   const size_t expected = n + sizeof(uint32);
-  storage->resize(expected);
-
-#if !defined(IS_SLIM_BUILD)
-  if (zlib_input_stream_) {
-    // If we have a zlib compressed buffer, we assume that the
-    // file is being read sequentially, and we use the underlying
-    // implementation to read the data.
-    //
-    // No checks are done to validate that the file is being read
-    // sequentially.  At some point the zlib input buffer may support
-    // seeking, possibly inefficiently.
-    TF_RETURN_IF_ERROR(zlib_input_stream_->ReadNBytes(expected, storage));
-
-    if (storage->size() != expected) {
-      if (storage->empty()) {
-        return errors::OutOfRange("eof");
-      } else {
-        return errors::DataLoss("truncated record at ", offset);
-      }
-    }
+  TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, result));
 
-    uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
-    if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
-      return errors::DataLoss("corrupted record at ", offset);
-    }
-    *result = StringPiece(storage->data(), n);
-  } else {
-#endif  // IS_SLIM_BUILD
-    if (options_.buffer_size > 0) {
-      // If we have a buffer, we assume that the file is being read
-      // sequentially, and we use the underlying implementation to read the
-      // data.
-      //
-      // No checks are done to validate that the file is being read
-      // sequentially.
-      TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, storage));
-
-      if (storage->size() != expected) {
-        if (storage->empty()) {
-          return errors::OutOfRange("eof");
-        } else {
-          return errors::DataLoss("truncated record at ", offset);
-        }
-      }
-
-      const uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
-      if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
-        return errors::DataLoss("corrupted record at ", offset);
-      }
-      *result = StringPiece(storage->data(), n);
+  if (result->size() != expected) {
+    if (result->empty()) {
+      return errors::OutOfRange("eof");
     } else {
-      // This version supports reading from arbitrary offsets
-      // since we are accessing the random access file directly.
-      StringPiece data;
-      TF_RETURN_IF_ERROR(src_->Read(offset, expected, &data, &(*storage)[0]));
-      if (data.size() != expected) {
-        if (data.empty()) {
-          return errors::OutOfRange("eof");
-        } else {
-          return errors::DataLoss("truncated record at ", offset);
-        }
-      }
-      const uint32 masked_crc = core::DecodeFixed32(data.data() + n);
-      if (crc32c::Unmask(masked_crc) != crc32c::Value(data.data(), n)) {
-        return errors::DataLoss("corrupted record at ", offset);
-      }
-      *result = StringPiece(data.data(), n);
+      return errors::DataLoss("truncated record at ", offset);
     }
-#if !defined(IS_SLIM_BUILD)
   }
-#endif  // IS_SLIM_BUILD
 
+  const uint32 masked_crc = core::DecodeFixed32(result->data() + n);
+  if (crc32c::Unmask(masked_crc) != crc32c::Value(result->data(), n)) {
+    return errors::DataLoss("corrupted record at ", offset);
+  }
+  result->resize(n);
   return Status::OK();
 }
 
@@ -167,50 +112,42 @@ Status RecordReader::ReadRecord(uint64* offset, string* record) {
   static const size_t kHeaderSize = sizeof(uint64) + sizeof(uint32);
   static const size_t kFooterSize = sizeof(uint32);
 
+  // Position the input stream.
+  int64 curr_pos = input_stream_->Tell();
+  int64 desired_pos = static_cast<int64>(*offset);
+  if (curr_pos > desired_pos || curr_pos < 0 /* EOF */ ||
+      (curr_pos == desired_pos && last_read_failed_)) {
+    last_read_failed_ = false;
+    TF_RETURN_IF_ERROR(input_stream_->Reset());
+    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos));
+  } else if (curr_pos < desired_pos) {
+    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos - curr_pos));
+  }
+  DCHECK_EQ(desired_pos, input_stream_->Tell());
+
   // Read header data.
-  StringPiece lbuf;
-  Status s = ReadChecksummed(*offset, sizeof(uint64), &lbuf, record);
+  Status s = ReadChecksummed(*offset, sizeof(uint64), record);
   if (!s.ok()) {
+    last_read_failed_ = true;
     return s;
   }
-  const uint64 length = core::DecodeFixed64(lbuf.data());
+  const uint64 length = core::DecodeFixed64(record->data());
 
   // Read data
-  StringPiece data;
-  s = ReadChecksummed(*offset + kHeaderSize, length, &data, record);
+  s = ReadChecksummed(*offset + kHeaderSize, length, record);
   if (!s.ok()) {
+    last_read_failed_ = true;
     if (errors::IsOutOfRange(s)) {
       s = errors::DataLoss("truncated record at ", *offset);
     }
     return s;
   }
 
-  if (record->data() != data.data()) {
-    // RandomAccessFile placed the data in some other location.
-    memmove(&(*record)[0], data.data(), data.size());
-  }
-
-  record->resize(data.size());
-
   *offset += kHeaderSize + length + kFooterSize;
+  DCHECK_EQ(*offset, input_stream_->Tell());
   return Status::OK();
 }
 
-Status RecordReader::SkipNBytes(uint64 offset) {
-#if !defined(IS_SLIM_BUILD)
-  if (zlib_input_stream_) {
-    TF_RETURN_IF_ERROR(zlib_input_stream_->SkipNBytes(offset));
-  } else {
-#endif
-    if (options_.buffer_size > 0) {
-      TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(offset));
-    }
-#if !defined(IS_SLIM_BUILD)
-  }
-#endif
-  return Status::OK();
-}  // namespace io
-
 SequentialRecordReader::SequentialRecordReader(
     RandomAccessFile* file, const RecordReaderOptions& options)
     : underlying_(file, options), offset_(0) {}
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index 26278e0328..f6d587dfa0 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -69,25 +69,14 @@ class RecordReader {
   // Read the record at "*offset" into *record and update *offset to
   // point to the offset of the next record.  Returns OK on success,
   // OUT_OF_RANGE for end of file, or something else for an error.
-  //
-  // Note: if buffering is used (with or without compression), access must be
-  // sequential.
   Status ReadRecord(uint64* offset, string* record);
 
-  // Skip the records till "offset". Returns OK on success,
-  // OUT_OF_RANGE for end of file, or something else for an error.
-  Status SkipNBytes(uint64 offset);
-
  private:
-  Status ReadChecksummed(uint64 offset, size_t n, StringPiece* result,
-                         string* storage);
+  Status ReadChecksummed(uint64 offset, size_t n, string* result);
 
-  RandomAccessFile* src_;
   RecordReaderOptions options_;
   std::unique_ptr<InputStreamInterface> input_stream_;
-#if !defined(IS_SLIM_BUILD)
-  std::unique_ptr<ZlibInputStream> zlib_input_stream_;
-#endif  // IS_SLIM_BUILD
+  bool last_read_failed_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(RecordReader);
 };
@@ -121,7 +110,6 @@ class SequentialRecordReader {
       return errors::InvalidArgument(
           "Trying to seek offset: ", offset,
           " which is less than the current offset: ", offset_);
-    TF_RETURN_IF_ERROR(underlying_.SkipNBytes(offset - offset_));
     offset_ = offset;
     return Status::OK();
   }
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index 63235761d9..da514bd21c 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -26,10 +26,11 @@ limitations under the License.
 
 namespace tensorflow {
 namespace io {
+namespace {
 
 // Construct a string of the specified length made out of the supplied
 // partial string.
-static string BigString(const string& partial_string, size_t n) {
+string BigString(const string& partial_string, size_t n) {
   string result;
   while (result.size() < n) {
     result.append(partial_string);
@@ -39,62 +40,66 @@ static string BigString(const string& partial_string, size_t n) {
 }
 
 // Construct a string from a number
-static string NumberString(int n) {
+string NumberString(int n) {
   char buf[50];
   snprintf(buf, sizeof(buf), "%d.", n);
   return string(buf);
 }
 
 // Return a skewed potentially long string
-static string RandomSkewedString(int i, random::SimplePhilox* rnd) {
+string RandomSkewedString(int i, random::SimplePhilox* rnd) {
   return BigString(NumberString(i), rnd->Skewed(17));
 }
 
-class RecordioTest : public ::testing::Test {
+class StringDest : public WritableFile {
+ public:
+  explicit StringDest(string* contents) : contents_(contents) {}
+
+  Status Close() override { return Status::OK(); }
+  Status Flush() override { return Status::OK(); }
+  Status Sync() override { return Status::OK(); }
+  Status Append(const StringPiece& slice) override {
+    contents_->append(slice.data(), slice.size());
+    return Status::OK();
+  }
+
  private:
-  class StringDest : public WritableFile {
-   public:
-    string contents_;
-
-    Status Close() override { return Status::OK(); }
-    Status Flush() override { return Status::OK(); }
-    Status Sync() override { return Status::OK(); }
-    Status Append(const StringPiece& slice) override {
-      contents_.append(slice.data(), slice.size());
-      return Status::OK();
+  string* contents_;
+};
+
+class StringSource : public RandomAccessFile {
+ public:
+  explicit StringSource(string* contents)
+      : contents_(contents), force_error_(false) {}
+
+  Status Read(uint64 offset, size_t n, StringPiece* result,
+              char* scratch) const override {
+    if (force_error_) {
+      force_error_ = false;
+      return errors::DataLoss("read error");
     }
-  };
-
-  class StringSource : public RandomAccessFile {
-   public:
-    StringPiece contents_;
-    mutable bool force_error_;
-    mutable bool returned_partial_;
-    StringSource() : force_error_(false), returned_partial_(false) {}
-
-    Status Read(uint64 offset, size_t n, StringPiece* result,
-                char* scratch) const override {
-      EXPECT_FALSE(returned_partial_) << "must not Read() after eof/error";
-
-      if (force_error_) {
-        force_error_ = false;
-        returned_partial_ = true;
-        return errors::DataLoss("read error");
-      }
-
-      if (offset >= contents_.size()) {
-        return errors::OutOfRange("end of file");
-      }
-
-      if (contents_.size() < offset + n) {
-        n = contents_.size() - offset;
-        returned_partial_ = true;
-      }
-      *result = StringPiece(contents_.data() + offset, n);
-      return Status::OK();
+
+    if (offset >= contents_->size()) {
+      return errors::OutOfRange("end of file");
+    }
+
+    if (contents_->size() < offset + n) {
+      n = contents_->size() - offset;
     }
-  };
+    *result = StringPiece(contents_->data() + offset, n);
+    return Status::OK();
+  }
+
+  void force_error() { force_error_ = true; }
+
+ private:
+  string* contents_;
+  mutable bool force_error_;
+};
 
+class RecordioTest : public ::testing::Test {
+ private:
+  string contents_;
   StringDest dest_;
   StringSource source_;
   bool reading_;
@@ -104,7 +109,9 @@ class RecordioTest : public ::testing::Test {
 
  public:
   RecordioTest()
-      : reading_(false),
+      : dest_(&contents_),
+        source_(&contents_),
+        reading_(false),
         readpos_(0),
         writer_(new RecordWriter(&dest_)),
         reader_(new RecordReader(&source_)) {}
@@ -119,12 +126,11 @@ class RecordioTest : public ::testing::Test {
     TF_ASSERT_OK(writer_->WriteRecord(StringPiece(msg)));
   }
 
-  size_t WrittenBytes() const { return dest_.contents_.size(); }
+  size_t WrittenBytes() const { return contents_.size(); }
 
   string Read() {
     if (!reading_) {
       reading_ = true;
-      source_.contents_ = StringPiece(dest_.contents_);
     }
     string record;
     Status s = reader_->ReadRecord(&readpos_, &record);
@@ -137,26 +143,20 @@ class RecordioTest : public ::testing::Test {
     }
   }
 
-  void IncrementByte(int offset, int delta) {
-    dest_.contents_[offset] += delta;
-  }
+  void IncrementByte(int offset, int delta) { contents_[offset] += delta; }
 
-  void SetByte(int offset, char new_byte) {
-    dest_.contents_[offset] = new_byte;
-  }
+  void SetByte(int offset, char new_byte) { contents_[offset] = new_byte; }
 
-  void ShrinkSize(int bytes) {
-    dest_.contents_.resize(dest_.contents_.size() - bytes);
-  }
+  void ShrinkSize(int bytes) { contents_.resize(contents_.size() - bytes); }
 
   void FixChecksum(int header_offset, int len) {
     // Compute crc of type/len/data
-    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset + 6], 1 + len);
+    uint32_t crc = crc32c::Value(&contents_[header_offset + 6], 1 + len);
     crc = crc32c::Mask(crc);
-    core::EncodeFixed32(&dest_.contents_[header_offset], crc);
+    core::EncodeFixed32(&contents_[header_offset], crc);
   }
 
-  void ForceError() { source_.force_error_ = true; }
+  void ForceError() { source_.force_error(); }
 
   void StartReadingAt(uint64_t initial_offset) { readpos_ = initial_offset; }
 
@@ -165,7 +165,6 @@ class RecordioTest : public ::testing::Test {
     Write("bar");
     Write(BigString("x", 10000));
     reading_ = true;
-    source_.contents_ = StringPiece(dest_.contents_);
     uint64 offset = WrittenBytes() + offset_past_end;
     string record;
     Status s = reader_->ReadRecord(&offset, &record);
@@ -217,16 +216,100 @@ TEST_F(RecordioTest, RandomRead) {
   ASSERT_EQ("EOF", Read());
 }
 
+void TestNonSequentialReads(const RecordWriterOptions& writer_options,
+                            const RecordReaderOptions& reader_options) {
+  string contents;
+  StringDest dst(&contents);
+  RecordWriter writer(&dst, writer_options);
+  for (int i = 0; i < 10; ++i) {
+    TF_ASSERT_OK(writer.WriteRecord(NumberString(i))) << i;
+  }
+  TF_ASSERT_OK(writer.Close());
+
+  StringSource file(&contents);
+  RecordReader reader(&file, reader_options);
+
+  string record;
+  // First read sequentially to fill in the offsets table.
+  uint64 offsets[10] = {0};
+  uint64 offset = 0;
+  for (int i = 0; i < 10; ++i) {
+    offsets[i] = offset;
+    TF_ASSERT_OK(reader.ReadRecord(&offset, &record)) << i;
+  }
+
+  // Read randomly: First go back to record #3 then forward to #8.
+  offset = offsets[3];
+  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
+  EXPECT_EQ("3.", record);
+  EXPECT_EQ(offsets[4], offset);
+
+  offset = offsets[8];
+  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
+  EXPECT_EQ("8.", record);
+  EXPECT_EQ(offsets[9], offset);
+}
+
+TEST_F(RecordioTest, NonSequentialReads) {
+  TestNonSequentialReads(RecordWriterOptions(), RecordReaderOptions());
+}
+
+TEST_F(RecordioTest, NonSequentialReadsWithReadBuffer) {
+  RecordReaderOptions options;
+  options.buffer_size = 1 << 10;
+  TestNonSequentialReads(RecordWriterOptions(), options);
+}
+
+TEST_F(RecordioTest, NonSequentialReadsWithCompression) {
+  TestNonSequentialReads(
+      RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
+      RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
+}
+
 // Tests of all the error paths in log_reader.cc follow:
-static void AssertHasSubstr(StringPiece s, StringPiece expected) {
+void AssertHasSubstr(StringPiece s, StringPiece expected) {
   EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
+void TestReadError(const RecordWriterOptions& writer_options,
+                   const RecordReaderOptions& reader_options) {
+  const string wrote = BigString("well hello there!", 100);
+  string contents;
+  StringDest dst(&contents);
+  TF_ASSERT_OK(RecordWriter(&dst, writer_options).WriteRecord(wrote));
+
+  StringSource file(&contents);
+  RecordReader reader(&file, reader_options);
+
+  uint64 offset = 0;
+  string read;
+  file.force_error();
+  Status status = reader.ReadRecord(&offset, &read);
+  ASSERT_TRUE(errors::IsDataLoss(status));
+  ASSERT_EQ(0, offset);
+
+  // A failed Read() shouldn't update the offset, and thus a retry shouldn't
+  // lose the record.
+  status = reader.ReadRecord(&offset, &read);
+  ASSERT_TRUE(status.ok()) << status;
+  EXPECT_GT(offset, 0);
+  EXPECT_EQ(wrote, read);
+}
+
 TEST_F(RecordioTest, ReadError) {
-  Write("foo");
-  ForceError();
-  AssertHasSubstr(Read(), "Data loss");
+  TestReadError(RecordWriterOptions(), RecordReaderOptions());
+}
+
+TEST_F(RecordioTest, ReadErrorWithBuffering) {
+  RecordReaderOptions options;
+  options.buffer_size = 1 << 20;
+  TestReadError(RecordWriterOptions(), options);
+}
+
+TEST_F(RecordioTest, ReadErrorWithCompression) {
+  TestReadError(RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
+                RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
 }
 
 TEST_F(RecordioTest, CorruptLength) {
@@ -257,5 +340,6 @@ TEST_F(RecordioTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
 
 TEST_F(RecordioTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
 
+}  // namespace
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc
index 984fbc2810..47de36bf6c 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.cc
+++ b/tensorflow/core/lib/io/zlib_inputstream.cc
@@ -25,8 +25,9 @@ ZlibInputStream::ZlibInputStream(
     InputStreamInterface* input_stream,
     size_t input_buffer_bytes,   // size of z_stream.next_in buffer
     size_t output_buffer_bytes,  // size of z_stream.next_out buffer
-    const ZlibCompressionOptions& zlib_options)
-    : input_stream_(input_stream),
+    const ZlibCompressionOptions& zlib_options, bool owns_input_stream)
+    : owns_input_stream_(owns_input_stream),
+      input_stream_(input_stream),
       input_buffer_capacity_(input_buffer_bytes),
       output_buffer_capacity_(output_buffer_bytes),
       z_stream_input_(new Bytef[input_buffer_capacity_]),
@@ -37,14 +38,25 @@ ZlibInputStream::ZlibInputStream(
   InitZlibBuffer();
 }
 
+ZlibInputStream::ZlibInputStream(InputStreamInterface* input_stream,
+                                 size_t input_buffer_bytes,
+                                 size_t output_buffer_bytes,
+                                 const ZlibCompressionOptions& zlib_options)
+    : ZlibInputStream(input_stream, input_buffer_bytes, output_buffer_bytes,
+                      zlib_options, false) {}
+
 ZlibInputStream::~ZlibInputStream() {
   if (z_stream_) {
     inflateEnd(z_stream_.get());
   }
+  if (owns_input_stream_) {
+    delete input_stream_;
+  }
 }
 
 Status ZlibInputStream::Reset() {
   TF_RETURN_IF_ERROR(input_stream_->Reset());
+  inflateEnd(z_stream_.get());
   InitZlibBuffer();
   bytes_read_ = 0;
   return Status::OK();
diff --git a/tensorflow/core/lib/io/zlib_inputstream.h b/tensorflow/core/lib/io/zlib_inputstream.h
index 9c7e14441c..37339163ee 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.h
+++ b/tensorflow/core/lib/io/zlib_inputstream.h
@@ -40,7 +40,15 @@ class ZlibInputStream : public InputStreamInterface {
   // Create a ZlibInputStream for `input_stream` with a buffer of size
   // `input_buffer_bytes` bytes for reading contents from `input_stream` and
   // another buffer with size `output_buffer_bytes` for caching decompressed
-  // contents. Does *not* take ownership of "input_stream".
+  // contents.
+  //
+  // Takes ownership of `input_stream` iff `owns_input_stream` is true.
+  ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes,
+                  size_t output_buffer_bytes,
+                  const ZlibCompressionOptions& zlib_options,
+                  bool owns_input_stream);
+
+  // Equivalent to the previous constructor with owns_input_stream=false.
   ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes,
                   size_t output_buffer_bytes,
                   const ZlibCompressionOptions& zlib_options);
@@ -65,10 +73,11 @@ class ZlibInputStream : public InputStreamInterface {
  private:
   void InitZlibBuffer();
 
-  InputStreamInterface* input_stream_;  // Not owned
-  size_t input_buffer_capacity_;        // Size of z_stream_input_
-  size_t output_buffer_capacity_;       // Size of z_stream_output_
-  char* next_unread_byte_;              // Next unread byte in z_stream_output_
+  const bool owns_input_stream_;
+  InputStreamInterface* input_stream_;
+  size_t input_buffer_capacity_;   // Size of z_stream_input_
+  size_t output_buffer_capacity_;  // Size of z_stream_output_
+  char* next_unread_byte_;         // Next unread byte in z_stream_output_
 
   // Buffer for storing contents read from compressed stream.
   // TODO(srbs): Consider using circular buffers. That would greatly simplify
-- 
GitLab


From d2fd0bbac6368a6b41e73d18c93b24442f5653f1 Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Thu, 19 Apr 2018 23:35:04 -0700
Subject: [PATCH 216/450] [TF:XLA] Factor out the handling of while
 instructions to make HloVerifier::Run shorter.

PiperOrigin-RevId: 193626864
---
 .../compiler/xla/service/hlo_verifier.cc      | 83 +++++++++++--------
 .../compiler/xla/service/hlo_verifier.h       |  8 +-
 2 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 8c875698eb..80ed6d6832 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -731,6 +731,55 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
   return tensorflow::Status::OK();
 }
 
+Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) {
+  auto* while_cond = instruction->while_condition();
+  auto* while_body = instruction->while_body();
+  if (while_cond->num_parameters() != 1) {
+    return FailedPrecondition(
+        "While condition must have exactly 1 parameter; had %lld : %s",
+        while_cond->num_parameters(), while_cond->ToString().c_str());
+  }
+  if (while_body->num_parameters() != 1) {
+    return FailedPrecondition(
+        "While body must have exactly 1 parameter; had %lld : %s",
+        while_body->num_parameters(), while_body->ToString().c_str());
+  }
+  if (instruction->operand_count() != 1) {
+    return FailedPrecondition(
+        "While loop must have exactly one operand; had %lld : %s",
+        instruction->operand_count(), instruction->ToString().c_str());
+  }
+  auto* init = instruction->operand(0);
+  auto* cond_param = while_cond->parameter_instruction(0);
+  if (!ShapeUtil::Compatible(init->shape(), cond_param->shape())) {
+    return FailedPrecondition(
+        "While condition's parameter must have the same shape as the "
+        "loop's 'init'. init: %s, param: %s",
+        init->ToString().c_str(), cond_param->ToString().c_str());
+  }
+  auto* cond_root = while_cond->root_instruction();
+  if (!ShapeUtil::Compatible(cond_root->shape(),
+                             ShapeUtil::MakeShape(PRED, {}))) {
+    return FailedPrecondition("While condition should have shape PRED: %s",
+                              cond_root->ToString().c_str());
+  }
+  auto* body_param = while_body->parameter_instruction(0);
+  if (!ShapeUtil::Compatible(init->shape(), body_param->shape())) {
+    return FailedPrecondition(
+        "While body's parameter must have the same shape as the loop's"
+        " 'init'. init: %s, param: %s",
+        init->ToString().c_str(), body_param->ToString().c_str());
+  }
+  auto* body_root = while_body->root_instruction();
+  if (!ShapeUtil::Compatible(init->shape(), body_root->shape())) {
+    return FailedPrecondition(
+        "While body should have same shape as the loop's 'init'."
+        " init: %s, body: %s",
+        init->ToString().c_str(), body_root->ToString().c_str());
+  }
+  return tensorflow::Status::OK();
+}
+
 StatusOr<bool> HloVerifier::Run(HloModule* module) {
   TF_RETURN_IF_ERROR(VerifyHloStructure(module));
 
@@ -771,39 +820,7 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
             << instruction->dimensions().size()
             << " != " << ShapeUtil::Rank(instruction->operand(0)->shape());
       } else if (instruction->opcode() == HloOpcode::kWhile) {
-        auto* while_cond = instruction->while_condition();
-        auto* while_body = instruction->while_body();
-        TF_RET_CHECK(while_cond->num_parameters() == 1)
-            << "While condition must have exactly 1 parameter; had "
-            << while_cond->num_parameters() << ": " << while_cond->ToString();
-        TF_RET_CHECK(while_body->num_parameters() == 1)
-            << "While body must have exactly 1 parameter; had "
-            << while_body->num_parameters() << ": " << while_body->ToString();
-        TF_RET_CHECK(instruction->operand_count() == 1)
-            << "While loop must have exactly one operand; had "
-            << instruction->operand_count() << ": " << instruction->ToString();
-
-        auto* init = instruction->operand(0);
-        auto* cond_param = while_cond->parameter_instruction(0);
-        TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), cond_param->shape()))
-            << "While condition's parameter must have the same shape as the "
-               "loop's 'init'. init: "
-            << init->ToString() << ", param: " << cond_param->ToString();
-        auto* cond_root = while_cond->root_instruction();
-        TF_RET_CHECK(ShapeUtil::Compatible(cond_root->shape(),
-                                           ShapeUtil::MakeShape(PRED, {})))
-            << "While condition should have shape PRED: "
-            << cond_root->ToString();
-
-        auto* body_param = while_body->parameter_instruction(0);
-        TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_param->shape()))
-            << "While body's parameter must have the same shape as the loop's "
-               "'init'. init: "
-            << init->ToString() << ", param: " << body_param->ToString();
-        auto* body_root = while_body->root_instruction();
-        TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_root->shape()))
-            << "While body should have same shape as the loop's 'init'. init: "
-            << init->ToString() << ", body: " << body_root->ToString();
+        TF_RETURN_IF_ERROR(CheckWhileInstruction(instruction));
       }
 
       auto previous = instructions.find(instruction->name());
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 1dd7ec3c51..1ec55a9bdc 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -102,7 +102,7 @@ class ShapeVerifier : public DfsHloVisitor {
   Status CheckTernaryShape(const HloInstruction* instruction);
   Status CheckVariadicShape(const HloInstruction* instruction);
 
-  // Checks if the given two instructions shares the same channel id.
+  // Checks if the given two instructions share the same channel id.
   Status CheckSameChannel(const HloInstruction* instr1,
                           const HloInstruction* instr2);
 
@@ -144,9 +144,11 @@ class HloVerifier : public HloPassInterface {
   // CHECKs various invariants of a fusion instruction.
   Status CheckFusionInstruction(HloInstruction* fusion) const;
 
+  Status CheckWhileInstruction(HloInstruction* instruction);
+
   // Creates a ShapeVerifier that checks that shapes match inferred
-  // expectations.  This is a factory function because ShapeVerifier,  Note that
-  // ShapeVerifier, being a DfsHloVisitor, is stateful.  We want a clean object
+  // expectations. This is a factory function because ShapeVerifier,
+  // being a DfsHloVisitor, is stateful. We want a clean object
   // for each run of the verifier.
   ShapeVerifierFactory shape_verifier_factory_;
 };
-- 
GitLab


From 9e0037513040fd09ee01442bd062936b41bee40c Mon Sep 17 00:00:00 2001
From: SukHwan Kim <30820468+jerry4897@users.noreply.github.com>
Date: Fri, 20 Apr 2018 18:24:52 +0900
Subject: [PATCH 217/450] Update c_api_test.cc

Typo
---
 tensorflow/c/c_api_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index ca80db23ed..9b86425aa5 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -1700,7 +1700,7 @@ TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) {
   TestGradientsError(false);
 }
 
-// REGISTER_OP for CApiTestAttributesTest test cases.
+// REGISTER_OP for CApiAttributesTest test cases.
 // Registers two ops, each with a single attribute called 'v'.
 // The attribute in one op will have a type 'type', the other
 // will have list(type).
-- 
GitLab


From 1ad32703d4e728d8fba835aaf24418f19cf85dbe Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 20 Apr 2018 03:29:31 -0700
Subject: [PATCH 218/450] [TF:XLA] Implement ClipByValue.

PiperOrigin-RevId: 193646890
---
 tensorflow/compiler/tests/ternary_ops_test.py | 18 ++++++
 tensorflow/compiler/tf2xla/kernels/BUILD      |  1 +
 .../tf2xla/kernels/clip_by_value_op.cc        | 61 +++++++++++++++++++
 3 files changed, 80 insertions(+)
 create mode 100644 tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc

diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py
index ba5f829936..75a2cf07c5 100644
--- a/tensorflow/compiler/tests/ternary_ops_test.py
+++ b/tensorflow/compiler/tests/ternary_ops_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.compiler.tests.xla_test import XLATestCase
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import googletest
 
@@ -119,6 +120,23 @@ class TernaryOpsTest(XLATestCase):
           np.array([2, 1], dtype=np.int32),
           expected=np.array([[2], [5]], dtype=dtype))
 
+  def testClipByValue(self):
+    # TODO(b/78258593): enable integer types here too.
+    for dtype in self.float_types:
+      test_cases = [
+          (np.array([2, 4, 5], dtype=dtype), dtype(7)),  #
+          (dtype(1), np.array([2, 4, 5], dtype=dtype)),  #
+          (np.array([-2, 7, 7], dtype=dtype), np.array([-2, 9, 8], dtype=dtype))
+      ]
+      x = np.array([-2, 10, 6], dtype=dtype)
+      for lower, upper in test_cases:
+        self._testTernary(
+            gen_math_ops._clip_by_value,
+            x,
+            lower,
+            upper,
+            expected=np.minimum(np.maximum(x, lower), upper))
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 579b669699..00fd08b1a0 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -21,6 +21,7 @@ tf_kernel_library(
         "cast_op.cc",
         "categorical_op.cc",
         "cholesky_op.cc",
+        "clip_by_value_op.cc",
         "concat_op.cc",
         "const_op.cc",
         "conv_ops.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
new file mode 100644
index 0000000000..fdf75be7b1
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class ClipByValueOp : public XlaOpKernel {
+ public:
+  explicit ClipByValueOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape shape = ctx->InputShape(0);
+    const TensorShape min_shape = ctx->InputShape(1);
+    const TensorShape max_shape = ctx->InputShape(2);
+
+    xla::ComputationBuilder* builder = ctx->builder();
+    auto input = ctx->Input(0);
+    auto min = ctx->Input(1);
+    auto max = ctx->Input(2);
+
+    auto shape_error = [&]() -> tensorflow::Status {
+      return errors::InvalidArgument(
+          "clip_value_min and clip_value_max must be either of "
+          "the same shape as input, or a scalar. ",
+          "Input shape: ", shape.DebugString(),
+          " clip_value_min shape: ", min_shape.DebugString(),
+          " clip_value_max shape: ", max_shape.DebugString());
+    };
+
+    if (shape != min_shape) {
+      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(min_shape), shape_error());
+      min = builder->Broadcast(min, shape.dim_sizes());
+    }
+    if (shape != max_shape) {
+      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(max_shape), shape_error());
+      max = builder->Broadcast(max, shape.dim_sizes());
+    }
+    ctx->SetOutput(0, builder->Clamp(min, input, max));
+  }
+};
+
+REGISTER_XLA_OP(Name("ClipByValue"), ClipByValueOp);
+
+}  // namespace
+}  // namespace tensorflow
-- 
GitLab


From 0c03255aa5f4b37de97e0685ffa15888fc16e4b3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 06:36:56 -0700
Subject: [PATCH 219/450] internal change

PiperOrigin-RevId: 193659701
---
 .../lite/toco/graph_transformations/propagate_fixed_sizes.cc   | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index b34aca1f09..ba244cf5ef 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1516,10 +1516,7 @@ void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) {
     return;
   }
 
-  // The current ArgMax implementation only supports 4-dimensional inputs with
-  // the last dimension as the axis to perform ArgMax for.
   const std::vector<int>& input_dims = input_array.shape().dims();
-  CHECK_EQ(input_dims.size(), 4);
   std::vector<int> output_dims;
 
   output_dims.reserve(input_dims.size() - 1);
-- 
GitLab


From c212d5542bb666b613a8567338983288a3ab15f4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 08:08:01 -0700
Subject: [PATCH 220/450] Eliminate the guard around Winograd non-fused
 convolutions with cudnn7.

PiperOrigin-RevId: 193669636
---
 .../fused_conv2d_bias_activation_op.cc        |  3 +-
 .../core/kernels/conv_grad_filter_ops.cc      |  3 +-
 .../core/kernels/conv_grad_input_ops.cc       |  3 +-
 tensorflow/core/kernels/conv_grad_ops_3d.cc   |  8 +++--
 tensorflow/core/kernels/conv_ops.cc           |  3 +-
 tensorflow/core/kernels/conv_ops_3d.cc        |  4 ++-
 tensorflow/core/kernels/conv_ops_gpu.h        | 35 +++++++++++++------
 tensorflow/core/kernels/conv_ops_test.cc      | 26 +++++++++-----
 8 files changed, 59 insertions(+), 26 deletions(-)

diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 0e06575d96..1e8f011b5d 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -543,7 +543,8 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
                                 fused_conv_parameters, &algorithm_config)) {
     std::vector<dnn::AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveAlgorithms(
-        fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(),
+        fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+            stream->parent()),
         &algorithms));
     dnn::ProfileResult best_result;
     dnn::ProfileResult best_result_no_scratch;
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 66ee474ca3..f3b91494b9 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -912,7 +912,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
                                 conv_parameters, &algorithm_config)) {
     std::vector<AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
-        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
+        &algorithms));
     ProfileResult best_result;
     ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 71ea0d5d72..66d15c6e78 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -961,7 +961,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
                                 conv_parameters, &algorithm_config)) {
     std::vector<AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
-        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
+        &algorithms));
     ProfileResult best_result;
     ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 3650ab53b2..1234997bc5 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -662,7 +662,9 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
                                    conv_parameters, &algorithm_config)) {
       std::vector<AlgorithmDesc> algorithms;
       CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
-          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+              stream->parent()),
+          &algorithms));
       ProfileResult best_result;
       ProfileResult best_result_no_scratch;
       for (auto profile_algorithm : algorithms) {
@@ -1029,7 +1031,9 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
                                    conv_parameters, &algorithm_config)) {
       std::vector<AlgorithmDesc> algorithms;
       CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
-          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+              stream->parent()),
+          &algorithms));
       ProfileResult best_result;
       ProfileResult best_result_no_scratch;
       for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 88843e4da7..f0888c655f 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -710,7 +710,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       !AutoTuneConv::GetInstance()->Find(conv_parameters, &algorithm_config)) {
     std::vector<AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveAlgorithms(
-        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
+        &algorithms));
     ProfileResult best_result;
     ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 21c84b2a0e..0b7c1524e6 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -396,7 +396,9 @@ struct LaunchConvOp<GPUDevice, T> {
                                   conv_parameters, &algorithm_config)) {
       std::vector<AlgorithmDesc> algorithms;
       CHECK(stream->parent()->GetConvolveAlgorithms(
-          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+              stream->parent()),
+          &algorithms));
       ProfileResult best_result;
       ProfileResult best_result_no_scratch;
       for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index f0085be3a5..7f9cfec981 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -137,20 +137,18 @@ class ConvParameters {
     // clang-format on
   }
 
-  // TODO(yangzihao): The purpose of this function is to disable winograd
-  // nonfused conv algorithm for certain input parameters so as to avoid a bug
-  // in cuDNNv5 and cuDNNv6. Remove this once switch to cuDNNv7.
+  // The purpose of this function is to disable winograd nonfused conv algorithm
+  // for certain input parameters so as to avoid a bug in cuDNNv5 and cuDNNv6.
   template <typename T>
-  bool ShouldIncludeWinogradNonfusedAlgo() const {
-    int64 total_size = 16 * std::ceil(batch_ / 16.0) *
-                       std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
-                       sizeof(T);
-    int64 threshold = 1LL << 31;
-    if (total_size >= threshold) {
-      return false;
-    } else {
+  bool ShouldIncludeWinogradNonfusedAlgo(
+      perftools::gputools::StreamExecutor* stream_exec) const {
+    // Skip this check for cuDNN 7 and newer.
+    perftools::gputools::port::StatusOr<std::tuple<int, int, int>> version =
+        stream_exec->AsDnn()->GetVersion();
+    if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
       return true;
     }
+    return ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
   }
 
  protected:
@@ -166,6 +164,21 @@ class ConvParameters {
   uint64 hash_code_;
 
  private:
+  friend struct ConvParametersPeer;  // For testing purposes.
+
+  template <typename T>
+  bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() const {
+    int64 total_size = 16 * std::ceil(batch_ / 16.0) *
+                       std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
+                       sizeof(T);
+    int64 threshold = 1LL << 31;
+    if (total_size >= threshold) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+
   int64 batch_;
   int64 in_depths_;
   int64 out_depths_;
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index e2e166c02f..8afe6a2cbd 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -22,20 +22,28 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
 
-#include "tensorflow/core/kernels/conv_ops_gpu.h"
-
 namespace tensorflow {
 
 #if GOOGLE_CUDA
 
+struct ConvParametersPeer {
+  template <typename T>
+  bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() {
+    return params.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
+  }
+
+  ConvParameters params;
+};
+
 TEST(ConvParameters, WinogradNonfusedAlgoSize) {
-  ConvParameters conv_params_small = {
+  ConvParametersPeer conv_params_small = {{
       1,         // batch
       32,        // in_depths
       {{300,     // in_rows
@@ -51,10 +59,11 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
         0}},     // padding_cols
       DT_FLOAT,  // tensor datatype
       0,         // device_id
-  };
-  EXPECT_TRUE(conv_params_small.ShouldIncludeWinogradNonfusedAlgo<float>());
+  }};
+  EXPECT_TRUE(
+      conv_params_small.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
 
-  ConvParameters conv_params_large = {
+  ConvParametersPeer conv_params_large = {{
       1,         // batch
       128,       // in_depths
       {{300,     // in_rows
@@ -70,8 +79,9 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
         0}},     // padding_cols
       DT_FLOAT,  // tensor datatype
       0,         // device_id
-  };
-  EXPECT_FALSE(conv_params_large.ShouldIncludeWinogradNonfusedAlgo<float>());
+  }};
+  EXPECT_FALSE(
+      conv_params_large.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
 }
 
 #endif  // GOOGLE_CUDA
-- 
GitLab


From 3e20fee5810796f70713122d235176b9c022ef41 Mon Sep 17 00:00:00 2001
From: Junpeng Lao <junpeng.lao@unifr.ch>
Date: Fri, 20 Apr 2018 18:05:52 +0200
Subject: [PATCH 221/450] Address comments from @srvasude

---
 .../kernel_tests/bijectors/ordered_test.py    | 32 +++++++++++--------
 .../python/ops/bijectors/ordered.py           | 21 ++++++++----
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
index 63c8f1fb31..721dba9c3a 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,33 +23,36 @@ import numpy as np
 from tensorflow.contrib.distributions.python.ops.bijectors.ordered import Ordered
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
 from tensorflow.python.platform import test
 
 
-rng = np.random.RandomState(42)
-
 
 class OrderedBijectorTest(test.TestCase):
   """Tests correctness of the ordered transformation."""
 
+  def setUp(self):
+      self._rng = np.random.RandomState(42)
+
+  @test_util.run_in_graph_and_eager_modes()
   def testBijectorVector(self):
     with self.test_session():
       ordered = Ordered()
       self.assertEqual("ordered", ordered.name)
       x = np.asarray([[2., 3, 4], [4., 8, 13]])
       y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]
-      self.assertAllClose(y, ordered.forward(x).eval())
-      self.assertAllClose(x, ordered.inverse(y).eval())
+      self.assertAllClose(y, self.evaluate(ordered.forward(x)))
+      self.assertAllClose(x, self.evaluate(ordered.inverse(y)))
       self.assertAllClose(
           np.sum(np.asarray(y)[..., 1:], axis=-1),
-          ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          self.evaluate(ordered.inverse_log_det_jacobian(y, event_ndims=1)),
           atol=0.,
           rtol=1e-7)
       self.assertAllClose(
-          -ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(),
-          ordered.forward_log_det_jacobian(x, event_ndims=1).eval(),
+          self.evaluate(-ordered.inverse_log_det_jacobian(y, event_ndims=1)),
+          self.evaluate(ordered.forward_log_det_jacobian(x, event_ndims=1)),
           atol=0.,
           rtol=1e-7)
 
@@ -79,6 +82,7 @@ class OrderedBijectorTest(test.TestCase):
           atol=0.,
           rtol=1e-7)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testShapeGetters(self):
     with self.test_session():
       x = tensor_shape.TensorShape([4])
@@ -86,18 +90,18 @@ class OrderedBijectorTest(test.TestCase):
       bijector = Ordered(validate_args=True)
       self.assertAllEqual(y, bijector.forward_event_shape(x))
       self.assertAllEqual(y.as_list(),
-                          bijector.forward_event_shape_tensor(
-                              x.as_list()).eval())
+                          self.evaluate(bijector.forward_event_shape_tensor(
+                              x.as_list())))
       self.assertAllEqual(x, bijector.inverse_event_shape(y))
       self.assertAllEqual(x.as_list(),
-                          bijector.inverse_event_shape_tensor(
-                              y.as_list()).eval())
+                          self.evaluate(bijector.inverse_event_shape_tensor(
+                              y.as_list())))
 
   def testBijectiveAndFinite(self):
     with self.test_session():
       ordered = Ordered()
-      x = np.sort(rng.randn(3, 10), axis=-1).astype(np.float32)
-      y = (rng.randn(3, 10)).astype(np.float32)
+      x = np.sort(self._rng.randn(3, 10), axis=-1).astype(np.float32)
+      y = (self._rng.randn(3, 10)).astype(np.float32)
       assert_bijective_and_finite(ordered, x, y, event_ndims=1)
 
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
index b2959cce31..46fec0562c 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
@@ -36,6 +36,8 @@ class Ordered(bijector.Bijector):
   """Bijector which maps a tensor x_k that has increasing elements in the last
   dimension to an unconstrained tensor y_k.
 
+  Both the domain and the codomain of the mapping are [-inf, inf]; however,
+  the input of the forward mapping must be strictly increasing.
   The inverse of the bijector applied to a normal random vector `y ~ N(0, 1)`
   gives back a sorted random vector with the same distribution `x ~ N(0, 1)`
   where `x = sort(y)`
@@ -55,11 +57,7 @@ class Ordered(bijector.Bijector):
   ```
   """
 
-  def __init__(self,
-               validate_args=False,
-               name="ordered"):
-    self._graph_parents = []
-    self._name = name
+  def __init__(self, validate_args=False, name="ordered"):
     super(Ordered, self).__init__(
         forward_min_event_ndims=1,
         validate_args=validate_args,
@@ -90,21 +88,30 @@ class Ordered(bijector.Bijector):
 
   def _forward(self, x):
     x = self._maybe_assert_valid_x(x)
-    y0 = array_ops.expand_dims(x[..., 0], -1)
+    y0 = x[..., 0, array_ops.newaxis]
     yk = math_ops.log(x[..., 1:] - x[..., :-1])
     y = array_ops.concat([y0, yk], axis=-1)
     return y
 
   def _inverse(self, y):
-    x0 = array_ops.expand_dims(y[..., 0], -1)
+    x0 = y[..., 0, array_ops.newaxis]
     xk = math_ops.exp(y[..., 1:])
     x = array_ops.concat([x0, xk], axis=-1)
     return math_ops.cumsum(x, axis=-1)
 
   def _inverse_log_det_jacobian(self, y):
+    # The Jacobian of the inverse mapping is lower
+    # triangular, with the diagonal elements being:
+    # J[i,i] = 1 if i=1, and
+    #          exp(y_i) if 1<i<=K
+    # which gives the absolute Jacobian determinant:
+    # |det(Jac)| = prod_{i=2}^{K} exp(y[i]).
+    # (1) - Stan Modeling Language User's Guide and Reference Manual
+    #       Version 2.17.0 section 35.2
     return math_ops.reduce_sum(y[..., 1:], axis=-1)
 
   def _forward_log_det_jacobian(self, x):
+    x = self._maybe_assert_valid_x(x)
     return -math_ops.reduce_sum(
       math_ops.log(x[..., 1:] - x[..., :-1]),
       axis=-1)
-- 
GitLab


From 814ab7e37dcbfa7f4749a1fd9d687d6be0207cb8 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Fri, 20 Apr 2018 09:20:36 -0700
Subject: [PATCH 222/450] [TF:XLA] Bump open source llvm revision to r330313

PiperOrigin-RevId: 193678317
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index d7bd2a2be0..aeaf8d7a24 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -451,11 +451,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/3210e64b499a31193051208f2f8922dadfc4bb6f.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/3210e64b499a31193051208f2f8922dadfc4bb6f.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/c1e9b6f826c86c87a7e7173f1baf7e7df9f43e32.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/c1e9b6f826c86c87a7e7173f1baf7e7df9f43e32.tar.gz",
       ],
-      sha256 = "017d7db029cc175634d75416c326770139c76590575ed44a3794c11ab160c955",
-      strip_prefix = "llvm-3210e64b499a31193051208f2f8922dadfc4bb6f",
+      sha256 = "92b7c01074f694a77b4d664951d1ec071e30ef19c61e673158e95fbb6e447b54",
+      strip_prefix = "llvm-c1e9b6f826c86c87a7e7173f1baf7e7df9f43e32",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
-- 
GitLab


From d0e3e998376f5e7d59678e5d42f3497e52ca7622 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Fri, 20 Apr 2018 09:23:52 -0700
Subject: [PATCH 223/450] Fix msan error in MapAndBatchDataset. While
 checkpointing tensors in BatchResult.output save only the initialized slice.
 If the final batch is short, the entire batch tensor may not be initialized.

PiperOrigin-RevId: 193678679
---
 .../kernels/data/map_and_batch_dataset_op.cc  | 44 +++++++++++++++----
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index b8105552a0..605ef3c0b7 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -331,7 +331,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
         CHECK_EQ(batch_results_.size(), batch_results_size);
         for (size_t i = 0; i < batch_results_size; ++i) {
-          TF_RETURN_IF_ERROR(ReadBatchResultLocked(reader, i));
+          TF_RETURN_IF_ERROR(ReadBatchResultLocked(ctx, reader, i));
         }
         return Status::OK();
       }
@@ -573,7 +573,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         // finish. This may delay saving a checkpoint by a bit but keeps the
         // code clean and also saves us from checkpointing the state of the
         // `BlockingCounter`.
-        batch_results_[index].counter->Wait();
+        int64 num_elements = 0;
+        WaitForBatch(index, &num_elements).IgnoreError();
+
         const BatchResult& result = batch_results_[index];
         string prefix = strings::StrCat("batch_results_", index);
         {
@@ -587,14 +589,24 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
             full_name(strings::StrCat(prefix, "_output_size")),
             result.output.size()));
         for (size_t i = 0; i < result.output.size(); i++) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              full_name(strings::StrCat(prefix, "_output_", i)),
-              result.output[i]));
+          // If the batch is not full, we only store the first
+          // `num_elements` values. The rest of the batch tensor is
+          // *uninitialized* and accessing that will raise msan errors.
+          if (num_elements < dataset()->batch_size_) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(prefix, "_output_", i)),
+                result.output[i].Slice(0, num_elements)));
+          } else {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(prefix, "_output_", i)),
+                result.output[i]));
+          }
         }
         return Status::OK();
       }
 
-      Status ReadBatchResultLocked(IteratorStateReader* reader, size_t index)
+      Status ReadBatchResultLocked(IteratorContext* ctx,
+                                   IteratorStateReader* reader, size_t index)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         BatchResult* result = &batch_results_[index];
         string prefix = strings::StrCat("batch_results_", index);
@@ -618,10 +630,24 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
         result->output.reserve(output_size);
         for (size_t i = 0; i < output_size; i++) {
-          result->output.emplace_back();
+          Tensor t;
           TF_RETURN_IF_ERROR(reader->ReadTensor(
-              full_name(strings::StrCat(prefix, "_output_", i)),
-              &result->output.back()));
+              full_name(strings::StrCat(prefix, "_output_", i)), &t));
+          // If the batch was not full, we may have stored only the relevant
+          // slice. Since tensors in `BatchResult.output` are expected to
+          // have the leading dimension of size batch_size, we build a larger
+          // tensor and copy the slice read from the checkpoint into it.
+          if (t.dim_size(0) < dataset()->batch_size_) {
+            TensorShape component_shape(t.shape());
+            component_shape.set_dim(0, dataset()->batch_size_);
+            AllocatorAttributes attr;
+            attr.set_gpu_compatible(true);
+            Tensor new_t(ctx->allocator(attr), t.dtype(), component_shape);
+            TF_RETURN_IF_ERROR(CopyPartialBatch(&new_t, t, t.dim_size(0)));
+            result->output.emplace_back(std::move(new_t));
+          } else {
+            result->output.emplace_back(std::move(t));
+          }
         }
         return Status::OK();
       }
-- 
GitLab


From cd462f39e58674a43d1f8c156f23235722b2281e Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Fri, 20 Apr 2018 09:31:08 -0700
Subject: [PATCH 224/450] Don't delete inbound_nodes and outbound_nodes, these
 no longer exist.

PiperOrigin-RevId: 193679512
---
 tensorflow/tools/docs/generate.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/tools/docs/generate.py b/tensorflow/tools/docs/generate.py
index c750539a76..fc93085e3e 100644
--- a/tensorflow/tools/docs/generate.py
+++ b/tensorflow/tools/docs/generate.py
@@ -43,10 +43,6 @@ if __name__ == '__main__':
 
   flags = doc_generator.parse_known_args()
 
-  # Suppress documentation of some symbols that users should never use.
-  del tf.layers.Layer.inbound_nodes
-  del tf.layers.Layer.outbound_nodes
-
   # tf_debug is not imported with tf, it's a separate module altogether
   doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
 
-- 
GitLab


From fb23c0e166179ccf372203982d8fe79de441e360 Mon Sep 17 00:00:00 2001
From: James Keeling <jtkeeling@google.com>
Date: Fri, 20 Apr 2018 09:54:50 -0700
Subject: [PATCH 225/450] Correct error in "Adding An Op" docs.

The macro `REGISTER_GPU` always declared a functor specialized on floats, instead of the type actually passed into the macro.

PiperOrigin-RevId: 193682519
---
 tensorflow/docs_src/extend/adding_an_op.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 84da2165b5..c3795492ce 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -267,7 +267,7 @@ REGISTER_CPU(int32);
 #ifdef GOOGLE_CUDA
 #define REGISTER_GPU(T)                                          \
   /* Declare explicit instantiations in kernel_example.cu.cc. */ \
-  extern template ExampleFunctor<GPUDevice, float>;              \
+  extern template ExampleFunctor<GPUDevice, T>;                  \
   REGISTER_KERNEL_BUILDER(                                       \
       Name("Example").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
       ExampleOp<GPUDevice, T>);
-- 
GitLab


From a749a6b95932d6f7438a01a2f5fd661343ad536f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 10:16:03 -0700
Subject: [PATCH 226/450] Change the TF record reader to use 16MB buffering by
 default in order to improve performance.

PiperOrigin-RevId: 193685521
---
 tensorflow/python/lib/io/py_record_reader.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/lib/io/py_record_reader.cc b/tensorflow/python/lib/io/py_record_reader.cc
index 5fcb51b3b2..9500fc6a7c 100644
--- a/tensorflow/python/lib/io/py_record_reader.cc
+++ b/tensorflow/python/lib/io/py_record_reader.cc
@@ -43,9 +43,10 @@ PyRecordReader* PyRecordReader::New(const string& filename, uint64 start_offset,
   reader->offset_ = start_offset;
   reader->file_ = file.release();
 
+  static const uint64 kReaderBufferSize = 16 * 1024 * 1024;
   RecordReaderOptions options =
       RecordReaderOptions::CreateRecordReaderOptions(compression_type_string);
-
+  options.buffer_size = kReaderBufferSize;
   reader->reader_ = new RecordReader(reader->file_, options);
   return reader;
 }
-- 
GitLab


From 729192823935156ae29d7f0d5f64c0bcd6034c7a Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Fri, 20 Apr 2018 10:32:24 -0700
Subject: [PATCH 227/450] Adding Shape inference functions to outfeed enqueue
 ops.

PiperOrigin-RevId: 193688099
---
 tensorflow/contrib/tpu/ops/outfeed_ops.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
index 5900c61a38..b05c76ca64 100644
--- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc
+++ b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
@@ -26,6 +26,7 @@ REGISTER_OP("OutfeedEnqueue")
     .Input("input: dtype")
     .Attr("dtype: type")
     .SetIsStateful()
+    .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
 An op which emits a single Tensor value from an XLA computation.
 
@@ -36,6 +37,7 @@ REGISTER_OP("OutfeedEnqueueTuple")
     .Input("inputs: dtypes")
     .Attr("dtypes: list(type)")
     .SetIsStateful()
+    .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
 An op which emits multiple Tensor values from an XLA computation.
 
-- 
GitLab


From da5a6d86b856001c03cccace5ac74fa8f045b6ae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 10:34:49 -0700
Subject: [PATCH 228/450] Disable constant folding and arithmetic optimizations
 for functions.

PiperOrigin-RevId: 193688466
---
 tensorflow/core/grappler/optimizers/meta_optimizer.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 22799311bc..cdc4698c34 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -243,6 +243,10 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   std::unordered_set<string> optimized_funcs;
   bool optimize_function_library = true;
 
+  // TODO(ezhulenev): turn it on after fixing ranklab: tune_tf_test.
+  cfg_.set_constant_folding(RewriterConfig::OFF);
+  cfg_.set_arithmetic_optimization(RewriterConfig::OFF);
+
   while (optimize_function_library) {
     optimize_function_library = false;
 
-- 
GitLab


From a09c02a3ecc190da8fbae88bdc54505de5387645 Mon Sep 17 00:00:00 2001
From: Junpeng Lao <junpeng.lao@unifr.ch>
Date: Fri, 20 Apr 2018 20:06:02 +0200
Subject: [PATCH 229/450] minor code styling

---
 .../contrib/distributions/python/ops/bijectors/ordered.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
index 46fec0562c..a180f1df0c 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
@@ -46,7 +46,7 @@ class Ordered(bijector.Bijector):
   `y[0] = x[0]`
   `y[1:] = math_ops.log(x[1:] - x[:-1])`
 
-  Example Use:
+  #### Example Use:
 
   ```python
   bijector.Ordered().forward([2, 3, 4])
-- 
GitLab


From b3f379e907259aa166c1ef734ccfd03331eb0a94 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Fri, 20 Apr 2018 11:10:56 -0700
Subject: [PATCH 230/450] [XLA:CPU] Use Eigen for F64 dot operations

PiperOrigin-RevId: 193694613
---
 tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc | 3 ++-
 tensorflow/compiler/xla/service/cpu/ir_emitter.cc     | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 29afd8ea5f..495fecc4aa 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -1070,7 +1070,8 @@ static bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
   // 1) be matrices with no padding, and
   // 2) have an allowed element type.
   PrimitiveType output_primitive_type = output_shape.element_type();
-  return (output_primitive_type == F32 || output_primitive_type == F16) &&
+  return (output_primitive_type == F64 || output_primitive_type == F32 ||
+          output_primitive_type == F16) &&
          IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) &&
          IsRank2WithNoPadding(output_shape);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 3405277d44..f990ee2785 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -2076,7 +2076,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
 
     TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
         /*instruction=*/*root, /*operands=*/{lhs, rhs},
-        /*supported_types=*/{F16, F32}));
+        /*supported_types=*/{F16, F32, F64}));
 
     llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs));
     llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs));
-- 
GitLab


From 49f3469d9533cb12d06ed3907b4ced975e2fcea4 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 20 Apr 2018 11:13:16 -0700
Subject: [PATCH 231/450] Use CreateWorkerSession and DeleteWorkerSession for
 all distributed sessions.

This change adds a phase to the session creation protocol: the master now contacts all workers to register a session handle and create a "WorkerSession" on each worker before it first registers or runs a graph on any worker. Subsequent requests to a worker ensure that the worker has the session handle registered before performing the request, and an AbortedError is raised if the worker has not (e.g. because it restarted after a failure).

As a result, more failure cases are covered by the high-level APIs (tf.estimator, Slim, etc.) that recreate the session on receiving an AbortedError. Previously, there was a possible race condition in which a PS task could restart between variable initialization and the first step, leading to a FailedPreconditionError ("Attempting to use uninitialized value") that would not be handled by the high-level APIs.

PiperOrigin-RevId: 193694958
---
 .../core/distributed_runtime/master_session.cc     | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index ebe350d313..1c67b42e76 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -89,6 +89,10 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   ~ReffedClientGraph() override {
     if (should_deregister_) {
       DeregisterPartitions();
+    } else {
+      for (Part& part : partitions_) {
+        worker_cache_->ReleaseWorker(part.name, part.worker);
+      }
     }
   }
 
@@ -1174,14 +1178,8 @@ Status MasterSession::Create(GraphDef* graph_def,
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForBaseGraph(
         graph_def, execution_options, &execution_state_));
   }
-  // TODO(b/36574172): Remove these conditions when ClusterSpec
-  // propagation is supported in all servers.
-  if (options.cluster_def != nullptr ||
-      session_opts_.config.isolate_session_state()) {
-    should_delete_worker_sessions_ = true;
-    return CreateWorkerSessions(options);
-  }
-  return Status::OK();
+  should_delete_worker_sessions_ = true;
+  return CreateWorkerSessions(options);
 }
 
 Status MasterSession::CreateWorkerSessions(
-- 
GitLab


From 570d90b9c7e6a19bc2606fdaf7ad0f85b8590c0e Mon Sep 17 00:00:00 2001
From: akindyakov <akindyakov@gmail.com>
Date: Fri, 20 Apr 2018 11:23:15 -0700
Subject: [PATCH 232/450] Speed up safe_strtod and safe_strtof functions by
 using double-conversion library Closes #12102.

PiperOrigin-RevId: 193696537
---
 tensorflow/contrib/cmake/CMakeLists.txt       |  4 +
 .../cmake/external/double_conversion.cmake    | 54 ++++++++++++
 tensorflow/contrib/makefile/Makefile          |  8 +-
 .../contrib/makefile/download_dependencies.sh |  4 +-
 tensorflow/core/BUILD                         |  9 +-
 tensorflow/core/lib/strings/numbers.cc        | 51 +++++++----
 tensorflow/core/lib/strings/numbers.h         |  2 +
 tensorflow/core/lib/strings/numbers_test.cc   | 87 +++++++++++++++++++
 tensorflow/core/lib/strings/str_util.cc       |  8 ++
 tensorflow/core/lib/strings/str_util.h        |  5 ++
 tensorflow/core/lib/strings/str_util_test.cc  | 56 ++----------
 tensorflow/tools/lib_package/BUILD            |  2 +
 tensorflow/tools/pip_package/BUILD            |  1 +
 tensorflow/workspace.bzl                      | 10 +++
 third_party/double_conversion.BUILD           | 38 ++++++++
 15 files changed, 270 insertions(+), 69 deletions(-)
 create mode 100644 tensorflow/contrib/cmake/external/double_conversion.cmake
 create mode 100644 third_party/double_conversion.BUILD

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 23b31ae1dc..bdf3e98635 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -193,6 +193,7 @@ include(protobuf)
 include(re2)
 include(cub)
 include(sqlite)
+include(double_conversion)
 if (tensorflow_BUILD_CC_TESTS)
   include(googletest)
 endif()
@@ -213,6 +214,7 @@ set(tensorflow_EXTERNAL_LIBRARIES
     ${protobuf_STATIC_LIBRARIES}
     ${re2_STATIC_LIBRARIES}
     ${sqlite_STATIC_LIBRARIES}
+    ${double_conversion_STATIC_LIBRARIES}
 )
 
 if (systemlib_ZLIB)
@@ -240,6 +242,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES
     fft2d
     re2
     sqlite_copy_headers_to_destination
+    double_conversion
 )
 
 include_directories(
@@ -262,6 +265,7 @@ include_directories(
     ${PROTOBUF_INCLUDE_DIRS}
     ${re2_INCLUDE_DIR}
     ${sqlite_INCLUDE_DIR}
+    ${double_conversion_INCLUDE_DIR}
 )
 
 if(tensorflow_ENABLE_SSL_SUPPORT)
diff --git a/tensorflow/contrib/cmake/external/double_conversion.cmake b/tensorflow/contrib/cmake/external/double_conversion.cmake
new file mode 100644
index 0000000000..527ccdc8d8
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/double_conversion.cmake
@@ -0,0 +1,54 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)
+
+set(double_conversion_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/double_conversion/src/double_conversion)
+set(double_conversion_URL https://github.com/google/double-conversion.git)
+set(double_conversion_TAG 5664746)
+set(double_conversion_BUILD ${double_conversion_INCLUDE_DIR})
+set(double_conversion_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.so)
+set(double_conversion_INCLUDES ${double_conversion_BUILD})
+
+if(WIN32)
+  set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/$(Configuration)/double-conversion.lib)
+else()
+  set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.a)
+endif()
+
+set(double_conversion_HEADERS
+    "${double_conversion_INCLUDE_DIR}/double-conversion/bignum-dtoa.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/cached-powers.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/double-conversion.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/fixed-dtoa.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/strtod.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/bignum.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/diy-fp.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/fast-dtoa.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/ieee.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/utils.h"
+)
+
+ExternalProject_Add(double_conversion
+    PREFIX double_conversion
+    GIT_REPOSITORY ${double_conversion_URL}
+    GIT_TAG ${double_conversion_TAG}
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    BUILD_IN_SOURCE 1
+    INSTALL_COMMAND ""
+    CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+)
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 05e8d9064b..1a1ab54a53 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -89,6 +89,7 @@ HOST_INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/double_conversion \
 -I$(HOST_GENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
@@ -125,7 +126,9 @@ PROTO_TEXT := $(HOST_BINDIR)proto_text
 # The list of dependencies is derived from the Bazel build file by running
 # the gen_file_lists.sh script on a system with a working Bazel setup.
 PROTO_TEXT_CC_FILES := $(shell cat $(MAKEFILE_DIR)/proto_text_cc_files.txt)
-PROTO_TEXT_PB_CC_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt)
+PROTO_TEXT_PB_CC_LIST := \
+	$(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt) \
+	$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc)
 PROTO_TEXT_PB_H_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_h_files.txt)
 
 # Locations of the intermediate files proto_text generates.
@@ -171,6 +174,7 @@ INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/double_conversion \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
@@ -326,6 +330,7 @@ $(MARCH_OPTION) \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/double_conversion \
 -I$(MAKEFILE_DIR)/gen/protobuf_android/$(ANDROID_ARCH)/include \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
@@ -603,6 +608,7 @@ $(wildcard tensorflow/core/platform/*/*.cc) \
 $(wildcard tensorflow/core/platform/*/*/*.cc) \
 $(wildcard tensorflow/core/util/*.cc) \
 $(wildcard tensorflow/core/util/*/*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) \
 tensorflow/core/util/version_info.cc
 # Remove duplicates (for version_info.cc)
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 8b415e6527..48953e2e38 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -32,7 +32,8 @@ GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.g
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
-FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
+FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+DOUBLE_CONVERSION_URL="$(grep -o "https.*google/double-conversion.*\.zip" "${BZL_FILE_PATH}" | head -n1)"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 
@@ -87,6 +88,7 @@ download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync"
 download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf"
 download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2"
 download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
+download_and_extract "${DOUBLE_CONVERSION_URL}" "${DOWNLOADS_DIR}/double_conversion"
 download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 download_and_extract "${CUB_URL}" "${DOWNLOADS_DIR}/cub/external/cub_archive"
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c15e7de186..5b04574a4f 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -337,7 +337,9 @@ cc_library(
         "lib/bfloat16/bfloat16.h",
     ] + tf_additional_proto_hdrs() + glob(tf_env_time_hdrs()),
     copts = tf_copts(),
-    deps = tf_lib_proto_parsing_deps(),
+    deps = tf_lib_proto_parsing_deps() + [
+        "@double_conversion//:double-conversion",
+    ],
 )
 
 # This build rule (along with :lib_internal, :framework, and
@@ -1231,6 +1233,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1270,6 +1273,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1333,6 +1337,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1355,6 +1360,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1751,6 +1757,7 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:platformlib",
         "@snappy",
         "@zlib_archive//:zlib",
+        "@double_conversion//:double-conversion",
         "@protobuf_archive//:protobuf",
     ] + tf_protos_all_impl() + tf_protos_grappler_impl(),
 )
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index c296daa95d..e4b909296e 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #include <locale>
 #include <unordered_map>
 
+#include "double-conversion/double-conversion.h"
+
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
@@ -110,6 +112,17 @@ T locale_independent_strtonum(const char* str, const char** endptr) {
   return result;
 }
 
+static inline const double_conversion::StringToDoubleConverter&
+StringToFloatConverter() {
+  static const double_conversion::StringToDoubleConverter converter(
+      double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES |
+          double_conversion::StringToDoubleConverter::ALLOW_HEX |
+          double_conversion::StringToDoubleConverter::ALLOW_TRAILING_SPACES |
+          double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY,
+      0., 0., "inf", "nan");
+  return converter;
+}
+
 }  // namespace
 
 namespace strings {
@@ -319,25 +332,31 @@ bool safe_strtou32(StringPiece str, uint32* value) {
 }
 
 bool safe_strtof(const char* str, float* value) {
-  const char* endptr;
-  *value = locale_independent_strtonum<float>(str, &endptr);
-  while (isspace(*endptr)) ++endptr;
-  // Ignore range errors from strtod/strtof.
-  // The values it returns on underflow and
-  // overflow are the right fallback in a
-  // robust setting.
-  return *str != '\0' && *endptr == '\0';
+  int processed_characters_count = -1;
+  auto len = str_util::Strnlen(str, kFastToBufferSize);
+
+  // If there is no zero-termination in str, fail.
+  if (len == kFastToBufferSize) return false;
+  // If string length exceeds int max, fail.
+  if (len > std::numeric_limits<int>::max()) return false;
+
+  *value = StringToFloatConverter().StringToFloat(str, static_cast<int>(len),
+                                                  &processed_characters_count);
+  return processed_characters_count > 0;
 }
 
 bool safe_strtod(const char* str, double* value) {
-  const char* endptr;
-  *value = locale_independent_strtonum<double>(str, &endptr);
-  while (isspace(*endptr)) ++endptr;
-  // Ignore range errors from strtod/strtof.
-  // The values it returns on underflow and
-  // overflow are the right fallback in a
-  // robust setting.
-  return *str != '\0' && *endptr == '\0';
+  int processed_characters_count = -1;
+  auto len = str_util::Strnlen(str, kFastToBufferSize);
+
+  // If there is no zero-termination in str, fail.
+  if (len == kFastToBufferSize) return false;
+  // If string length exceeds int max, fail.
+  if (len > std::numeric_limits<int>::max()) return false;
+
+  *value = StringToFloatConverter().StringToDouble(str, static_cast<int>(len),
+                                                   &processed_characters_count);
+  return processed_characters_count > 0;
 }
 
 size_t FloatToBuffer(float value, char* buffer) {
diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h
index 6b7703be37..e9add42849 100644
--- a/tensorflow/core/lib/strings/numbers.h
+++ b/tensorflow/core/lib/strings/numbers.h
@@ -114,11 +114,13 @@ bool safe_strtou64(StringPiece str, uint64* value);
 // Convert strings to floating point values.
 // Leading and trailing spaces are allowed.
 // Values may be rounded on over- and underflow.
+// Returns false on invalid input or if `strlen(str) >= kFastToBufferSize`.
 bool safe_strtof(const char* str, float* value);
 
 // Convert strings to double precision floating point values.
 // Leading and trailing spaces are allowed.
 // Values may be rounded on over- and underflow.
+// Returns false on invalid input or if `strlen(str) >= kFastToBufferSize`.
 bool safe_strtod(const char* str, double* value);
 
 inline bool ProtoParseNumeric(StringPiece s, int32* value) {
diff --git a/tensorflow/core/lib/strings/numbers_test.cc b/tensorflow/core/lib/strings/numbers_test.cc
index e15161de66..0f22dac262 100644
--- a/tensorflow/core/lib/strings/numbers_test.cc
+++ b/tensorflow/core/lib/strings/numbers_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/numbers.h"
 
+#include <cmath>
 #include <string>
 #include "tensorflow/core/platform/test.h"
 
@@ -277,7 +278,49 @@ TEST(safe_strtof, Float) {
   EXPECT_TRUE(safe_strtof("-0x2A", &result));
   EXPECT_EQ(-42.0f, result);
 
+  EXPECT_TRUE(safe_strtof(" -0x2", &result));
+  EXPECT_EQ(-2.0f, result);
+
+  EXPECT_TRUE(safe_strtof("8 \t", &result));
+  EXPECT_EQ(8.0f, result);
+
+  EXPECT_TRUE(safe_strtof("\t20.0\t ", &result));
+  EXPECT_EQ(20.0f, result);
+
   EXPECT_FALSE(safe_strtof("-infinity is awesome", &result));
+
+  // Make sure we exit cleanly if the string is not terminated
+  char test_str[2 * kFastToBufferSize];
+  for (int i = 0; i < 2 * kFastToBufferSize; ++i) test_str[i] = 'a';
+  EXPECT_FALSE(safe_strtof(test_str, &result));
+
+  // Make sure we exit cleanly if the string is too long
+  test_str[kFastToBufferSize + 1] = '\0';
+  EXPECT_FALSE(safe_strtof(test_str, &result));
+
+  EXPECT_TRUE(safe_strtof("-inf", &result));
+  EXPECT_EQ(-std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("+inf", &result));
+  EXPECT_EQ(std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("InF", &result));
+  EXPECT_EQ(std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("-INF", &result));
+  EXPECT_EQ(-std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtof("-nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtof("-NaN", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtof("+NAN", &result));
+  EXPECT_TRUE(std::isnan(result));
 }
 
 TEST(safe_strtod, Double) {
@@ -287,6 +330,15 @@ TEST(safe_strtod, Double) {
   EXPECT_EQ(0.1234567890123, result);
   EXPECT_FALSE(safe_strtod("0.1234567890123abc", &result));
 
+  // Make sure we exit cleanly if the string is not terminated
+  char test_str[2 * kFastToBufferSize];
+  for (int i = 0; i < 2 * kFastToBufferSize; ++i) test_str[i] = 'a';
+  EXPECT_FALSE(safe_strtod(test_str, &result));
+
+  // Make sure we exit cleanly if the string is too long
+  test_str[kFastToBufferSize + 1] = '\0';
+  EXPECT_FALSE(safe_strtod(test_str, &result));
+
   // Overflow to infinity, underflow to 0.
   EXPECT_TRUE(safe_strtod("1e310", &result));
   EXPECT_EQ(std::numeric_limits<double>::infinity(), result);
@@ -296,6 +348,41 @@ TEST(safe_strtod, Double) {
 
   EXPECT_TRUE(safe_strtod("1e-325", &result));
   EXPECT_EQ(0, result);
+
+  EXPECT_TRUE(safe_strtod(" -0x1c", &result));
+  EXPECT_EQ(-28.0, result);
+
+  EXPECT_TRUE(safe_strtod("50 \t", &result));
+  EXPECT_EQ(50.0, result);
+
+  EXPECT_TRUE(safe_strtod("\t82.0\t ", &result));
+  EXPECT_EQ(82.0, result);
+
+  EXPECT_FALSE(safe_strtod("infinity", &result));
+
+  EXPECT_TRUE(safe_strtod("-inf", &result));
+  EXPECT_EQ(-std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("+inf", &result));
+  EXPECT_EQ(std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("InF", &result));
+  EXPECT_EQ(std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("-INF", &result));
+  EXPECT_EQ(-std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtod("-nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtod("-NaN", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtod("+NAN", &result));
+  EXPECT_TRUE(std::isnan(result));
 }
 
 }  // namespace strings
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc
index 2c9e98357a..4598b8ccc7 100644
--- a/tensorflow/core/lib/strings/str_util.cc
+++ b/tensorflow/core/lib/strings/str_util.cc
@@ -454,6 +454,14 @@ bool SplitAndParseAsFloats(StringPiece text, char delim,
                                     result);
 }
 
+size_t Strnlen(const char* str, const size_t string_max_len) {
+  size_t len = 0;
+  while (len < string_max_len && str[len] != '\0') {
+    ++len;
+  }
+  return len;
+}
+
 bool StrContains(StringPiece haystack, StringPiece needle) {
   return std::search(haystack.begin(), haystack.end(), needle.begin(),
                      needle.end()) != haystack.end();
diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h
index 065871c1b4..e97d00b975 100644
--- a/tensorflow/core/lib/strings/str_util.h
+++ b/tensorflow/core/lib/strings/str_util.h
@@ -223,6 +223,11 @@ std::vector<string> Split(StringPiece text, char delims, Predicate p) {
   return Split(text, StringPiece(&delims, 1), p);
 }
 
+// Returns the length of the given null-terminated byte string 'str'.
+// Returns 'string_max_len' if the null character was not found in the first
+// 'string_max_len' bytes of 'str'.
+size_t Strnlen(const char* str, const size_t string_max_len);
+
 }  // namespace str_util
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc
index 63643c3e8e..3bf3e99825 100644
--- a/tensorflow/core/lib/strings/str_util_test.cc
+++ b/tensorflow/core/lib/strings/str_util_test.cc
@@ -430,56 +430,12 @@ TEST(StringReplace, EmptyStringReplaceAll) {
   EXPECT_EQ("", str_util::StringReplace("", "a", "X", /*replace_all=*/true));
 }
 
-TEST(StartsWith, Basic) {
-  const string s1(
-      "123"
-      "\0"
-      "456",
-      7);
-  const StringPiece a("foobar");
-  const StringPiece b(s1);
-  const StringPiece e;
-  EXPECT_TRUE(str_util::StartsWith(a, a));
-  EXPECT_TRUE(str_util::StartsWith(a, "foo"));
-  EXPECT_TRUE(str_util::StartsWith(a, e));
-  EXPECT_TRUE(str_util::StartsWith(b, s1));
-  EXPECT_TRUE(str_util::StartsWith(b, b));
-  EXPECT_TRUE(str_util::StartsWith(b, e));
-  EXPECT_TRUE(str_util::StartsWith(e, ""));
-  EXPECT_FALSE(str_util::StartsWith(a, b));
-  EXPECT_FALSE(str_util::StartsWith(b, a));
-  EXPECT_FALSE(str_util::StartsWith(e, a));
-}
-
-TEST(EndsWith, Basic) {
-  const string s1(
-      "123"
-      "\0"
-      "456",
-      7);
-  const StringPiece a("foobar");
-  const StringPiece b(s1);
-  const StringPiece e;
-  EXPECT_TRUE(str_util::EndsWith(a, a));
-  EXPECT_TRUE(str_util::EndsWith(a, "bar"));
-  EXPECT_TRUE(str_util::EndsWith(a, e));
-  EXPECT_TRUE(str_util::EndsWith(b, s1));
-  EXPECT_TRUE(str_util::EndsWith(b, b));
-  EXPECT_TRUE(str_util::EndsWith(b, e));
-  EXPECT_TRUE(str_util::EndsWith(e, ""));
-  EXPECT_FALSE(str_util::EndsWith(a, b));
-  EXPECT_FALSE(str_util::EndsWith(b, a));
-  EXPECT_FALSE(str_util::EndsWith(e, a));
-}
-
-TEST(StrContains, Basic) {
-  StringPiece a("abcdefg");
-  StringPiece b("abcd");
-  StringPiece c("efg");
-  StringPiece d("gh");
-  EXPECT_TRUE(str_util::StrContains(a, b));
-  EXPECT_TRUE(str_util::StrContains(a, c));
-  EXPECT_TRUE(!str_util::StrContains(a, d));
+TEST(Strnlen, Basic) {
+  EXPECT_EQ(0, str_util::Strnlen("ab", 0));
+  EXPECT_EQ(1, str_util::Strnlen("a", 1));
+  EXPECT_EQ(2, str_util::Strnlen("abcd", 2));
+  EXPECT_EQ(3, str_util::Strnlen("abc", 10));
+  EXPECT_EQ(4, str_util::Strnlen("a \t\n", 10));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 0ede8c6370..569b6678ca 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -118,6 +118,7 @@ genrule(
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
+        "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
@@ -155,6 +156,7 @@ genrule(
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
+        "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 0ac5a5bb6d..7b508f87ab 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -128,6 +128,7 @@ filegroup(
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
+        "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index aeaf8d7a24..bbef4b9e5f 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -693,6 +693,16 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = clean_dep("//third_party/flatbuffers:flatbuffers.BUILD"),
   )
 
+  native.new_http_archive(
+      name = "double_conversion",
+      urls = [
+          "https://github.com/google/double-conversion/archive/3992066a95b823efc8ccc1baf82a1cfc73f6e9b8.zip",
+      ],
+      sha256 = "2f7fbffac0d98d201ad0586f686034371a6d152ca67508ab611adc2386ad30de",
+      strip_prefix = "double-conversion-3992066a95b823efc8ccc1baf82a1cfc73f6e9b8",
+      build_file = clean_dep("//third_party:double_conversion.BUILD")
+  )
+
   tf_http_archive(
       name = "tflite_mobilenet",
       sha256 = "23f814d1c076bdf03715dfb6cab3713aa4fbdf040fd5448c43196bd2e97a4c1b",
diff --git a/third_party/double_conversion.BUILD b/third_party/double_conversion.BUILD
new file mode 100644
index 0000000000..9f905216c0
--- /dev/null
+++ b/third_party/double_conversion.BUILD
@@ -0,0 +1,38 @@
+# Bazel(http://bazel.io) BUILD file
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "double-conversion",
+    srcs = [
+        "double-conversion/bignum.cc",
+        "double-conversion/bignum-dtoa.cc",
+        "double-conversion/cached-powers.cc",
+        "double-conversion/diy-fp.cc",
+        "double-conversion/double-conversion.cc",
+        "double-conversion/fast-dtoa.cc",
+        "double-conversion/fixed-dtoa.cc",
+        "double-conversion/strtod.cc",
+        "double-conversion/utils.h",
+    ],
+    hdrs = [
+        "double-conversion/bignum.h",
+        "double-conversion/bignum-dtoa.h",
+        "double-conversion/cached-powers.h",
+        "double-conversion/diy-fp.h",
+        "double-conversion/double-conversion.h",
+        "double-conversion/fast-dtoa.h",
+        "double-conversion/fixed-dtoa.h",
+        "double-conversion/ieee.h",
+        "double-conversion/strtod.h",
+    ],
+    includes = [
+        ".",
+    ],
+    linkopts = [
+        "-lm",
+    ],
+    visibility = ["//visibility:public"],
+)
-- 
GitLab


From 5fbb1feecd77a70b32d333b56bd13b1798b9a766 Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Fri, 20 Apr 2018 11:23:29 -0700
Subject: [PATCH 233/450] Temporarily set cudnn Rnn math precision to fp32.

Problem:
cudnnGetRNNLinLayerMatrixParams() returns the error CUDNN_STATUS_BAD_PARAM when:

* the RNN descriptor's math precision is CUDNN_DATA_FLOAT
* the input descriptor's dataType is CUDNN_DATA_HALF
* the weight descriptor's dataType is CUDNN_DATA_HALF

Setting the RNN descriptor's math precision to CUDNN_DATA_HALF avoids the error.

cudnn 7.1.4 will fix the problem.

PiperOrigin-RevId: 193696566
---
 tensorflow/stream_executor/cuda/cuda_dnn.cc | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index d673e19007..640f270323 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -2529,12 +2529,20 @@ cudnnDataType_t GetConvComputeType<double>() {
 }
 
 // A helper struct to decide whether to use FP32 as the internal compute type
-// for rnn when the input data type is FP16. By default it is turned on,
-// users can explicitly disable them (choose to use FP16 as the internal compute
-// type) through an env-var "TF_FP16_RNN_USE_FP32_COMPUTE=0".
+// for rnn when the input data type is FP16. At present it is turned off;
+// users can explicitly control it through the env-var
+// TF_FP16_RNN_USE_FP32_COMPUTE.
+// After the TODO below is fixed, users should almost always use fp32 compute
+// type for training. Using fp16 might suffer suboptimal accuracy due to loss
+// in precision.
 struct RnnDoFP32ComputationFP16Input {
   static constexpr const char* kName = "TF_FP16_RNN_USE_FP32_COMPUTE";
-  static constexpr bool kDefaultFlag = true;
+  // TODO(jamesqin): b/78182362 flip to true when cudnn 7.1.4 fixes the bug.
+  // Before cudnn 7.1.4 RNN are always done in fp32, no matter what math
+  // precision is set.
+  // Set it temporarily to false so that the error raised when combining fp16
+  // inputs with fp32 math precision is avoided.
+  static constexpr bool kDefaultFlag = false;
 };
 
 // A helper function to return the internal compute type for
-- 
GitLab


From 712bbc5d7babd523951445f361f0e339061cd259 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 20 Apr 2018 11:24:53 -0700
Subject: [PATCH 234/450] Allow creating tensors from numpy arrays, and other
 various constants - try #2

Allow type-inference from a different input tensor, similar to args_to_matching_eager.

- Update TFE_Py_TensorShapeSlice to take tuples.
- Update int values to allow int/long in py2
END_PUBLIC

BEGIN_PUBLIC
Automated g4 rollback of changelist 192184809

PiperOrigin-RevId: 193696790
---
 tensorflow/python/eager/pywrap_tensor.cc  | 201 ++++++++--------
 tensorflow/python/eager/pywrap_tensor.h   |  10 +
 tensorflow/python/eager/pywrap_tfe.h      |  12 +-
 tensorflow/python/eager/pywrap_tfe_src.cc | 278 +++++++++++++++++++---
 tensorflow/python/eager/tensor_test.py    |   7 +-
 tensorflow/python/framework/ops.py        |  16 ++
 6 files changed, 389 insertions(+), 135 deletions(-)

diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 519814b979..b5b4e394e3 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -60,42 +60,6 @@ TFE_TensorHandle* NumpyToTensorHandle(PyObject* obj) {
   }
 }
 
-// Casts data referred to by `handle` from type `src_type_enum` to type
-// `dst_type_enum`.
-TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
-                            TF_DataType src_type_enum,
-                            TF_DataType dst_type_enum, TF_Status* out_status) {
-  if (ctx == nullptr) return nullptr;
-  const char* op_name = "Cast";
-  const char* device_name = "/job:localhost/replica:0/task:0/device:CPU:0";
-  TFE_Op* op = TFE_NewOp(ctx, op_name, out_status);
-#define RETURN_ERROR  \
-  {                   \
-    TFE_DeleteOp(op); \
-    return nullptr;   \
-  }
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpSetDevice(op, device_name, out_status);
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpAddInput(op, handle, out_status);
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpSetAttrType(op, "SrcT", src_type_enum);
-  TFE_OpSetAttrType(op, "DstT", dst_type_enum);
-  TFE_TensorHandle* output = nullptr;
-  int num_outputs = 1;
-  TFE_Execute(op, &output, &num_outputs, out_status);
-  if (TF_GetCode(out_status) != TF_OK || num_outputs != 1 ||
-      output == nullptr) {
-    if (output != nullptr) {
-      TFE_DeleteTensorHandle(output);
-    }
-    RETURN_ERROR
-  }
-  TFE_DeleteOp(op);
-  return output;
-#undef RETURN_ERROR
-}
-
 TFE_TensorHandle* CopyToDevice(TFE_TensorHandle* handle, PyObject* ctx,
                                PyObject* dev) {
   const char* device = "";
@@ -161,6 +125,100 @@ PyObject* PyIntFromDataType(TF_DataType l) {
 
 }  // namespace
 
+namespace tensorflow {
+// Casts data referred to by `handle` from type `src_type_enum` to type
+// `dst_type_enum`.
+TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
+                            TF_DataType src_type_enum,
+                            TF_DataType dst_type_enum, TF_Status* out_status) {
+  if (ctx == nullptr) return nullptr;
+  const char* op_name = "Cast";
+  const char* device_name = "/job:localhost/replica:0/task:0/device:CPU:0";
+  TFE_Op* op = TFE_NewOp(ctx, op_name, out_status);
+#define RETURN_ERROR  \
+  {                   \
+    TFE_DeleteOp(op); \
+    return nullptr;   \
+  }
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpSetDevice(op, device_name, out_status);
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpAddInput(op, handle, out_status);
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpSetAttrType(op, "SrcT", src_type_enum);
+  TFE_OpSetAttrType(op, "DstT", dst_type_enum);
+  TFE_TensorHandle* output = nullptr;
+  int num_outputs = 1;
+  TFE_Execute(op, &output, &num_outputs, out_status);
+  if (TF_GetCode(out_status) != TF_OK || num_outputs != 1 ||
+      output == nullptr) {
+    if (output != nullptr) {
+      TFE_DeleteTensorHandle(output);
+    }
+    RETURN_ERROR
+  }
+  TFE_DeleteOp(op);
+  return output;
+#undef RETURN_ERROR
+}
+
+TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype) {
+  int desired_dtype = -1;
+  if (dtype != Py_None) {
+    if (!PyIntToDataType(dtype, &desired_dtype)) {
+      PyErr_SetString(PyExc_TypeError,
+                      tensorflow::strings::StrCat(
+                          "Expecting a DataType value for dtype. Got ",
+                          Py_TYPE(dtype)->tp_name)
+                          .c_str());
+      return nullptr;
+    }
+  }
+  if (PyArray_Check(value)) {
+    int desired_np_dtype = -1;
+    if (desired_dtype >= 0) {
+      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
+               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
+               .ok()) {
+        PyErr_SetString(PyExc_TypeError,
+                        tensorflow::strings::StrCat(
+                            "Invalid dtype argument value ", desired_dtype)
+                            .c_str());
+        return nullptr;
+      }
+    }
+    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
+    int current_np_dtype = PyArray_TYPE(array);
+    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
+    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
+        !PyArray_ISCARRAY(array)) {
+      int new_dtype =
+          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
+      safe_value = tensorflow::make_safe(
+          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
+                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
+      if (PyErr_Occurred()) return nullptr;
+      if (safe_value == nullptr) {
+        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
+        return nullptr;
+      }
+      value = safe_value.get();
+    }
+    return NumpyToTensorHandle(value);
+  } else {
+    tensorflow::Tensor t;
+    // TODO(josh11b): Have PySeqToTensor set python errors instead of
+    // returning Status.
+    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
+    if (!cppstatus.ok()) {
+      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
+      return nullptr;
+    }
+    return TFE_NewTensorHandle(t);
+  }
+}
+}  // namespace tensorflow
+
 extern "C" {
 
 static const int kMaxEagerTensorParentSize = 64;
@@ -230,61 +288,16 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
       return -1;
     }
   }
-  tensorflow::Safe_TFE_TensorHandlePtr handle =
-      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(nullptr));
   PyErr_Clear();
-  if (PyArray_Check(value)) {
-    int desired_np_dtype = -1;
-    if (desired_dtype >= 0) {
-      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
-               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
-               .ok()) {
-        PyErr_SetString(PyExc_TypeError,
-                        tensorflow::strings::StrCat(
-                            "Invalid dtype argument value ", desired_dtype)
-                            .c_str());
-        return -1;
-      }
-    }
-    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
-    int current_np_dtype = PyArray_TYPE(array);
-    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
-    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
-        !PyArray_ISCARRAY(array)) {
-      int new_dtype =
-          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
-      safe_value = tensorflow::make_safe(
-          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
-                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
-      if (PyErr_Occurred()) return -1;
-      if (safe_value == nullptr) {
-        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
-        return -1;
-      }
-      value = safe_value.get();
-    }
-    handle = tensorflow::make_safe(NumpyToTensorHandle(value));
-  } else {
-    tensorflow::Tensor t;
-    // TODO(josh11b): Have PySeqToTensor set python errors instead of
-    // returning Status.
-    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
-    if (!cppstatus.ok()) {
-      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
-      return -1;
-    }
-    handle = tensorflow::make_safe(TFE_NewTensorHandle(t));
-  }
-  if (PyErr_Occurred()) return -1;
-  if (handle == nullptr) {
-    PyErr_SetString(PyExc_ValueError, "Error while creating an EagerTensor");
-    return -1;
-  }
+  tensorflow::Safe_TFE_TensorHandlePtr handle =
+      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(
+          tensorflow::ConvertToEagerTensor(value, dtype)));
+  if (handle == nullptr) return -1;
   TF_DataType handle_dtype = TFE_TensorHandleDataType(handle.get());
   if (desired_dtype >= 0 && desired_dtype != handle_dtype) {
-    handle = tensorflow::make_safe(
-        EagerCast(GetContext(context), handle.get(), handle_dtype,
-                  static_cast<TF_DataType>(desired_dtype), self->status));
+    handle = tensorflow::make_safe(tensorflow::EagerCast(
+        GetContext(context), handle.get(), handle_dtype,
+        static_cast<TF_DataType>(desired_dtype), self->status));
     if (TF_GetCode(self->status) != TF_OK) {
       PyErr_SetString(PyExc_ValueError,
                       tensorflow::strings::StrCat(
@@ -701,12 +714,12 @@ PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) {
   return reinterpret_cast<PyObject*>(EagerTensorType);
 }
 
-PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
-  if (!PyList_Check(tensor_list)) {
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim) {
+  if (!PyList_Check(tensors) && !PyTuple_Check(tensors)) {
     PyErr_SetString(PyExc_TypeError,
                     tensorflow::strings::StrCat(
-                        "tensor_list argument must be a list. Got \"",
-                        Py_TYPE(tensor_list)->tp_name, "\"")
+                        "tensors argument must be a list or a tuple. Got \"",
+                        Py_TYPE(tensors)->tp_name, "\"")
                         .c_str());
     return nullptr;
   }
@@ -720,14 +733,14 @@ PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
     return nullptr;
   }
 
-  Py_ssize_t num_tensors = PyList_Size(tensor_list);
+  Py_ssize_t num_tensors = PySequence_Fast_GET_SIZE(tensors);
   int64_t num_tensors_int = static_cast<int64_t>(num_tensors);
   auto tensor = tensorflow::make_safe(TF_AllocateTensor(
       TF_INT32, &num_tensors_int, /*num_dims=*/1, /*len=*/4 * num_tensors_int));
   int32_t* data = reinterpret_cast<int32_t*>(TF_TensorData(tensor.get()));
   auto status = tensorflow::make_safe(TF_NewStatus());
   for (Py_ssize_t i = 0; i < num_tensors; ++i) {
-    PyObject* tensor_obj = PyList_GET_ITEM(tensor_list, i);
+    PyObject* tensor_obj = PySequence_Fast_GET_ITEM(tensors, i);
     if (!EagerTensor_CheckExact(tensor_obj)) {
       PyErr_SetString(PyExc_TypeError,
                       tensorflow::strings::StrCat(
diff --git a/tensorflow/python/eager/pywrap_tensor.h b/tensorflow/python/eager/pywrap_tensor.h
index aa1efdd1b8..63ab1ed84d 100644
--- a/tensorflow/python/eager/pywrap_tensor.h
+++ b/tensorflow/python/eager/pywrap_tensor.h
@@ -22,4 +22,14 @@ limitations under the License.
 bool EagerTensor_CheckExact(const PyObject* o);
 tensorflow::int64 EagerTensor_id(const PyObject* tensor);
 
+namespace tensorflow {
+TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype);
+
+// TODO(nareshmodi): Move EagerCast and ReadVariableOp (which use the C API to
+// execute TFE Ops) to a separate common library.
+TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
+                            TF_DataType src_type_enum,
+                            TF_DataType dst_type_enum, TF_Status* out_status);
+}
+
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 32d731d0f6..691b613e48 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -186,16 +186,16 @@ PyObject* TFE_Py_RecordGradient(PyObject* op_name, PyObject* inputs,
 // Returns the set of variables watched by the given tape.
 PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape);
 
-// Returns an EagerTensor of dimension [len(`tensor_list`)] containing
-// the `slice_dim`'th dimension of each tensor in `tensor_list`. In other words,
+// Returns an EagerTensor of dimension [len(`tensors`)] containing
+// the `slice_dim`'th dimension of each tensor in `tensors`. In other words,
 // TFE_Py_TensorShapeSlice takes a slice of dimensions of tensors in
-// `tensor_list`. For example, if `tensor_list` contains tensors of with shapes
+// `tensors`. For example, if `tensors` contains tensors with shapes
 // [1, 2, 3], [4, 5], [6, 7, 8, 9], TFE_Py_TensorShapeSlice called with
 // `slice_dim` equal to 1 will return [2, 5, 7].
 // On error, returns nullptr and sets python exception.
-// REQUIRES: `tensor_list` is a python list of EagerTensors
+// REQUIRES: `tensors` is a python list/tuple of EagerTensors
 // REQUIRES: `slice_dim` is non-negative and smaller than the rank of all
-//   tensors in `tensor_list`.
-PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim);
+//   tensors in `tensors`.
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim);
 
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index d99bd0b0ff..2bfa1f052c 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -38,6 +38,54 @@ using tensorflow::strings::Printf;
 
 namespace {
 
+struct InputInfo {
+  InputInfo(int i, bool is_list) : i(i), is_list(is_list) {}
+
+  int i;
+  bool is_list = false;
+};
+
+using AttrToInputsMap =
+    tensorflow::gtl::FlatMap<string,
+                             tensorflow::gtl::InlinedVector<InputInfo, 4>>;
+
+tensorflow::mutex all_attr_to_input_maps_lock(
+    tensorflow::LINKER_INITIALIZED);
+tensorflow::gtl::FlatMap<string, AttrToInputsMap*>* GetAllAttrToInputsMaps() {
+  static auto* all_attr_to_input_maps =
+      new tensorflow::gtl::FlatMap<string, AttrToInputsMap*>;
+  return all_attr_to_input_maps;
+}
+
+AttrToInputsMap* GetAttrToInputsMap(const tensorflow::OpDef& op_def) {
+  tensorflow::mutex_lock l(all_attr_to_input_maps_lock);
+  auto* all_attr_to_input_maps = GetAllAttrToInputsMaps();
+
+  auto* output =
+      tensorflow::gtl::FindPtrOrNull(*all_attr_to_input_maps, op_def.name());
+  if (output != nullptr) {
+    return output;
+  }
+
+  std::unique_ptr<AttrToInputsMap> m(new AttrToInputsMap);
+
+  // Map each type attr name to the inputs (index, is_list) that reference it.
+  for (int i = 0; i < op_def.input_arg_size(); i++) {
+    if (!op_def.input_arg(i).type_attr().empty()) {
+      auto it = m->find(op_def.input_arg(i).type_attr());
+      if (it == m->end()) {
+        it = m->insert({op_def.input_arg(i).type_attr(), {}}).first;
+      }
+      it->second.emplace_back(i, !op_def.input_arg(i).number_attr().empty());
+    }
+  }
+
+  auto* retval = m.get();
+  (*all_attr_to_input_maps)[op_def.name()] = m.release();
+
+  return retval;
+}
+
 struct FastPathOpExecInfo {
   TFE_Context* ctx;
   const char* device_name;
@@ -53,6 +101,14 @@ struct FastPathOpExecInfo {
   // The op type name of the main op being executed.
   PyObject* op_name;
   PyObject* callbacks;
+
+  // All the args passed into the FastPathOpExecInfo.
+  PyObject* args;
+
+  // DTypes can come from another input that has the same attr. So build that
+  // map.
+  const AttrToInputsMap* attr_to_inputs_map;
+  tensorflow::gtl::FlatMap<string, tensorflow::DataType> cached_dtypes;
 };
 
 #define PARSE_VALUE(fn_name, type, check_fn, parse_fn)                       \
@@ -76,12 +132,29 @@ PARSE_VALUE(ParseIntValue, int, PyLong_Check, PyLong_AsLong)
 PARSE_VALUE(ParseInt64Value, int64_t, PyLong_Check, PyLong_AsLong)
 #else
 PARSE_VALUE(ParseIntValue, int, PyInt_Check, PyInt_AsLong)
-PARSE_VALUE(ParseInt64Value, int64_t, PyInt_Check, PyInt_AsLong)
-PARSE_VALUE(ParseInt64LongValue, int64_t, PyLong_Check, PyLong_AsLong)
 #endif
 PARSE_VALUE(ParseFloatValue, float, PyFloat_Check, PyFloat_AsDouble)
 #undef PARSE_VALUE
 
+#if PY_MAJOR_VERSION < 3
+// Python 2 only: `int` and `long` are both accepted for an int64 attr.
+bool ParseInt64Value(const string& key, PyObject* py_value, TF_Status* status,
+                     int64_t* value) {
+  const bool is_int = PyInt_Check(py_value);
+  if (!is_int && !PyLong_Check(py_value)) {
+    const string message = tensorflow::strings::StrCat(
+        "Expecting int or long value for attr ", key, ", got ",
+        py_value->ob_type->tp_name);
+    TF_SetStatus(status, TF_INVALID_ARGUMENT, message.c_str());
+    return false;
+  }
+  const long parsed =
+      is_int ? PyInt_AsLong(py_value) : PyLong_AsLong(py_value);
+  *value = static_cast<int64_t>(parsed);
+  return true;
+}
+#endif
+
 Py_ssize_t TensorShapeNumDims(PyObject* value) {
   const auto size = PySequence_Size(value);
   if (size == -1) {
@@ -234,7 +307,7 @@ bool SetOpAttrList(
     std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
     // Copy the input dims into the buffer and set dims to point to
     // the start of each list's dims.
-    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+    std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
     std::unique_ptr<int[]> num_dims(new int[num_values]);
     int64_t* offset = buffer.get();
     for (int i = 0; i < num_values; ++i) {
@@ -296,7 +369,7 @@ void SetOpAttrListDefault(
     TF_Status* status) {
   if (type == TF_ATTR_STRING) {
     int num_values = attr.default_value().list().s_size();
-    std::unique_ptr<const char* []> values(new const char*[num_values]);
+    std::unique_ptr<const char*[]> values(new const char*[num_values]);
     (*attr_list_sizes)[key] = num_values;
     for (int i = 0; i < num_values; i++) {
       values[i] = attr.default_value().list().s(i).data();
@@ -349,7 +422,7 @@ void SetOpAttrListDefault(
     std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
     // Copy the input dims into the buffer and set dims to point to
     // the start of each list's dims.
-    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+    std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
     std::unique_ptr<int[]> num_dims(new int[num_values]);
     int64_t* offset = buffer.get();
     for (int i = 0; i < num_values; ++i) {
@@ -369,7 +442,7 @@ void SetOpAttrListDefault(
   } else if (type == TF_ATTR_FUNC) {
     int num_values = attr.default_value().list().func_size();
     (*attr_list_sizes)[key] = num_values;
-    std::unique_ptr<const TFE_Op* []> funcs(new const TFE_Op*[num_values]);
+    std::unique_ptr<const TFE_Op*[]> funcs(new const TFE_Op*[num_values]);
     for (int i = 0; i < num_values; i++) {
       funcs[i] = GetFunc(ctx, attr.default_value().list().func(i), status);
     }
@@ -1399,10 +1472,39 @@ PyObject* GetPythonObjectFromString(const char* s) {
 #endif
 }
 
+PyObject* GetPythonObjectFromInt(int num) {  // Boxes `num` as a Python int.
+#if PY_MAJOR_VERSION >= 3
+  return PyLong_FromLong(num);
+#else  // Python 2: build a PyInt rather than a PyLong.
+  return PyInt_FromLong(num);
+#endif  // PY_MAJOR_VERSION >= 3
+}
+
 bool CheckResourceVariable(PyObject* item) {
   return PyObject_TypeCheck(item, resource_variable_type);
 }
 
+// True for python floats and integers (`int` as well as `long` on Python 2).
+bool IsNumberType(PyObject* item) {
+#if PY_MAJOR_VERSION < 3
+  if (PyInt_Check(item)) return true;
+#endif
+  return PyFloat_Check(item) || PyLong_Check(item);
+}
+
+// Reports whether `item` is a single input the fast path can consume as-is:
+// an EagerTensor, a resource variable, a numpy array or a python number.
+//
+// Sequences are not properly handled. Sequences with purely python numeric
+// types work, but sequences with mixes of EagerTensors and python numeric
+// types don't work.
+// TODO(nareshmodi): fix
+bool CheckOneInput(PyObject* item) {
+  const bool is_tensor_like =
+      EagerTensor_CheckExact(item) || CheckResourceVariable(item);
+  return is_tensor_like || PyArray_Check(item) || IsNumberType(item);
+}
+
 bool CheckInputsOk(PyObject* seq, int start_index,
                    const tensorflow::OpDef& op_def) {
   for (int i = 0; i < op_def.input_arg_size(); i++) {
@@ -1419,8 +1521,7 @@ bool CheckInputsOk(PyObject* seq, int start_index,
       }
       for (Py_ssize_t j = 0; j < PySequence_Fast_GET_SIZE(item); j++) {
         PyObject* inner_item = PySequence_Fast_GET_ITEM(item, j);
-        if (!EagerTensor_CheckExact(inner_item) &&
-            !CheckResourceVariable(inner_item)) {
+        if (!CheckOneInput(inner_item)) {
           VLOG(1)
               << "Falling back to slow path for Op \"" << op_def.name()
               << "\", Input \"" << op_def.input_arg(i).name() << "\", Index "
@@ -1430,7 +1531,7 @@ bool CheckInputsOk(PyObject* seq, int start_index,
           return false;
         }
       }
-    } else if (!EagerTensor_CheckExact(item) && !CheckResourceVariable(item)) {
+    } else if (!CheckOneInput(item)) {
       VLOG(1)
           << "Falling back to slow path for Op \"" << op_def.name()
           << "\", Input \"" << op_def.input_arg(i).name()
@@ -1443,6 +1544,52 @@ bool CheckInputsOk(PyObject* seq, int start_index,
   return true;
 }
 
+// Returns a new reference to the `_type_enum` of `item`'s dtype when `item` is
+// an EagerTensor or a resource variable. Returns nullptr for any other object,
+// and also when an attribute lookup fails (the Python error is left set).
+PyObject* MaybeGetDType(PyObject* item) {
+  // EagerTensors expose the dtype as `dtype`, resource variables as `_dtype`.
+  const char* dtype_attr = EagerTensor_CheckExact(item)    ? "dtype"
+                           : CheckResourceVariable(item)   ? "_dtype"
+                                                           : nullptr;
+  if (dtype_attr == nullptr) return nullptr;
+  tensorflow::Safe_PyObjectPtr py_dtype(
+      PyObject_GetAttrString(item, dtype_attr));
+  // PyObject_GetAttrString must not be handed a null object.
+  if (py_dtype == nullptr) return nullptr;
+  return PyObject_GetAttrString(py_dtype.get(), "_type_enum");
+}
+
+// Finds a dtype hint for type attr `attr`: a dtype cached on `op_exec_info`
+// wins, else the first dtype found on an input tied to `attr`, else Py_None.
+PyObject* MaybeGetDTypeForAttr(const string& attr,
+                               FastPathOpExecInfo* op_exec_info) {
+  auto cached = op_exec_info->cached_dtypes.find(attr);
+  if (cached != op_exec_info->cached_dtypes.end()) {
+    return GetPythonObjectFromInt(cached->second);
+  }
+
+  auto inputs = op_exec_info->attr_to_inputs_map->find(attr);
+  if (inputs == op_exec_info->attr_to_inputs_map->end()) {
+    // No other inputs - this should never happen.
+    Py_RETURN_NONE;
+  }
+
+  for (const auto& input_info : inputs->second) {
+    PyObject* item = PyTuple_GET_ITEM(
+        op_exec_info->args, kFastPathExecuteInputStartIndex + input_info.i);
+    if (!input_info.is_list) {
+      if (PyObject* dtype = MaybeGetDType(item)) return dtype;
+      continue;
+    }
+    for (int i = 0; i < PySequence_Fast_GET_SIZE(item); i++) {
+      PyObject* dtype = MaybeGetDType(PySequence_Fast_GET_ITEM(item, i));
+      if (dtype != nullptr) return dtype;
+    }
+  }
+  Py_RETURN_NONE;
+}
+
 bool OpDoesntRequireOutput(const string& op_name) {
   static tensorflow::gtl::FlatSet<string>* ops_that_dont_require_outputs =
       new tensorflow::gtl::FlatSet<string>({
@@ -1668,23 +1815,80 @@ bool ReadVariableOp(const FastPathOpExecInfo& parent_op_exec_info,
 //  i) input is an EagerTensor
 //  ii) input is a ResourceVariable - in this case, the is_variable param is set
 //  to true.
-bool ConvertToTensor(const FastPathOpExecInfo& op_exec_info, PyObject* input,
-                     tensorflow::Safe_PyObjectPtr* output_handle,
-                     TF_Status* status) {
-  if (CheckResourceVariable(input)) {
+//
+//  NOTE: dtype_hint_getter must *always* return a PyObject that can be
+//  decref'd. So if no hint is found, use Py_RETURN_NONE (which correctly
+//  increfs Py_None).
+bool ConvertToTensor(
+    const FastPathOpExecInfo& op_exec_info, PyObject* input,
+    tensorflow::Safe_PyObjectPtr* output_handle,
+    // This gets a hint for this particular input.
+    const std::function<PyObject*()>& dtype_hint_getter,
+    // This sets the dtype after conversion is complete.
+    const std::function<void(const TF_DataType& dtype)>& dtype_setter,
+    TF_Status* status) {
+  if (EagerTensor_CheckExact(input)) {
+    Py_INCREF(input);
+    output_handle->reset(input);
+    return true;
+  } else if (CheckResourceVariable(input)) {
     return ReadVariableOp(op_exec_info, input, output_handle, status);
   }
 
-  Py_INCREF(input);
-  output_handle->reset(input);
+  // The hint comes from a supposedly similarly typed tensor.
+  tensorflow::Safe_PyObjectPtr dtype_hint(dtype_hint_getter());
+  if (PyErr_Occurred()) return false;
+
+  tensorflow::Safe_TFE_TensorHandlePtr handle =
+      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(
+          tensorflow::ConvertToEagerTensor(input, dtype_hint.get())));
+  if (handle == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Unable to convert value to tensor");
+    return false;
+  }
+
+  // -1 means there is no hint to enforce on the converted tensor.
+  int desired_dtype = -1;
+  if (dtype_hint.get() != Py_None) {
+    if (!ParseTypeValue("", dtype_hint.get(), status, &desired_dtype)) {
+      status->status = tensorflow::errors::InvalidArgument(
+          "Expecting a DataType value for dtype. Got ",
+          Py_TYPE(dtype_hint.get())->tp_name);
+      // Fail here; falling through could return true with an error status.
+      return false;
+    }
+  }
+
+  TF_DataType handle_dtype = TFE_TensorHandleDataType(handle.get());
+  if (desired_dtype >= 0 && desired_dtype != handle_dtype) {
+    handle = tensorflow::make_safe(
+        tensorflow::EagerCast(op_exec_info.ctx, handle.get(), handle_dtype,
+                              static_cast<TF_DataType>(desired_dtype), status));
+    if (!status->status.ok()) return false;
+
+    handle_dtype = TFE_TensorHandleDataType(handle.get());
+  }
+
+  if (handle_dtype != TF_INT32) {
+    // Note that this is a shallow copy and will share the underlying buffer
+    // if copying to the same device.
+    handle = tensorflow::make_safe(TFE_TensorHandleCopyToDevice(
+        handle.get(), op_exec_info.ctx, op_exec_info.device_name, status));
+    if (!status->status.ok()) return false;
+  }
+
+  output_handle->reset(EagerTensorFromHandle(handle.release()));
+  dtype_setter(handle_dtype);
 
   return true;
 }
 
 // Adds input and type attr to the op, and to the list of flattened
 // inputs/attrs.
-bool AddInputToOp(const FastPathOpExecInfo& op_exec_info, PyObject* input,
-                  const tensorflow::OpDef::ArgDef* input_arg,
+bool AddInputToOp(FastPathOpExecInfo* op_exec_info, PyObject* input,
+                  const bool add_type_attr,
+                  const tensorflow::OpDef::ArgDef& input_arg,
                   std::vector<tensorflow::Safe_PyObjectPtr>* flattened_attrs,
                   std::vector<tensorflow::Safe_PyObjectPtr>* flattened_inputs,
                   TFE_Op* op, TF_Status* status) {
@@ -1693,18 +1897,30 @@ bool AddInputToOp(const FastPathOpExecInfo& op_exec_info, PyObject* input,
   // out of scope in this function.
   tensorflow::Safe_PyObjectPtr py_eager_tensor = nullptr;
 
-  if (!ConvertToTensor(op_exec_info, input, &py_eager_tensor, status)) {
+  if (!ConvertToTensor(
+          *op_exec_info, input, &py_eager_tensor,
+          [&]() {
+            if (input_arg.type() != tensorflow::DataType::DT_INVALID) {
+              return GetPythonObjectFromInt(input_arg.type());
+            }
+            return MaybeGetDTypeForAttr(input_arg.type_attr(), op_exec_info);
+          },
+          [&](const TF_DataType dtype) {
+            op_exec_info->cached_dtypes[input_arg.type_attr()] =
+                static_cast<tensorflow::DataType>(dtype);
+          },
+          status)) {
     return false;
   }
 
   TFE_TensorHandle* input_handle = EagerTensor_Handle(py_eager_tensor.get());
 
-  if (input_arg != nullptr && !input_arg->type_attr().empty()) {
+  if (add_type_attr && !input_arg.type_attr().empty()) {
     auto dtype = TFE_TensorHandleDataType(input_handle);
-    TFE_OpSetAttrType(op, input_arg->type_attr().data(), dtype);
+    TFE_OpSetAttrType(op, input_arg.type_attr().data(), dtype);
     if (flattened_attrs != nullptr) {
       flattened_attrs->emplace_back(
-          GetPythonObjectFromString(input_arg->type_attr().data()));
+          GetPythonObjectFromString(input_arg.type_attr().data()));
       flattened_attrs->emplace_back(PyLong_FromLong(dtype));
     }
   }
@@ -1844,6 +2060,7 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
   op_exec_info.ctx = reinterpret_cast<TFE_Context*>(
       PyCapsule_GetPointer(PyTuple_GET_ITEM(args, 0), nullptr));
+  op_exec_info.args = args;
 
   if (op_exec_info.ctx == nullptr) {
     // The context hasn't been initialized. It will be in the slow path.
@@ -1892,6 +2109,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
+  op_exec_info.attr_to_inputs_map = GetAttrToInputsMap(*op_def);
+
   TF_Status* status = TF_NewStatus();
   TFE_Op* op = TFE_NewOp(op_exec_info.ctx, op_def->name().c_str(), status);
   auto cleaner = tensorflow::gtl::MakeCleanup([status, op] {
@@ -1986,17 +2205,16 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
       if (len > 0) {
         // First item adds the type attr.
-        if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
-                          &input_arg, flattened_attrs.get(),
+        if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
+                          true, input_arg, flattened_attrs.get(),
                           flattened_inputs.get(), op, status)) {
           return nullptr;
         }
 
         for (Py_ssize_t j = 1; j < len; j++) {
           // Since the list is homogeneous, we don't need to re-add the attr.
-          if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, j),
-                            nullptr /* input_arg */,
-                            nullptr /* flattened_attrs */,
+          if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, j),
+                            false, input_arg, nullptr /* flattened_attrs */,
                             flattened_inputs.get(), op, status)) {
             return nullptr;
           }
@@ -2018,7 +2236,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
         PyObject* py_input = PySequence_Fast_GET_ITEM(input, j);
         tensorflow::Safe_PyObjectPtr py_eager_tensor;
         if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
-                             status)) {
+                             []() { Py_RETURN_NONE; },
+                             [](const TF_DataType& dtype) {}, status)) {
           return nullptr;
         }
 
@@ -2048,8 +2267,9 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       attr_list_sizes[attr_name] = len;
     } else {
       // The item is a single item.
-      if (!AddInputToOp(op_exec_info, input, &input_arg, flattened_attrs.get(),
-                        flattened_inputs.get(), op, status)) {
+      if (!AddInputToOp(&op_exec_info, input, true, input_arg,
+                        flattened_attrs.get(), flattened_inputs.get(), op,
+                        status)) {
         return nullptr;
       }
     }
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 0bd5a5dbaf..b044b30231 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -278,14 +278,9 @@ class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
     with self.assertRaisesRegexp(
         TypeError,
-        r"tensor_list argument must be a list. Got \"EagerTensor\""):
+        r"tensors argument must be a list or a tuple. Got \"EagerTensor\""):
       pywrap_tensorflow.TFE_Py_TensorShapeSlice(t1, -2)
 
-    with self.assertRaisesRegexp(
-        TypeError,
-        r"tensor_list argument must be a list. Got \"tuple\""):
-      pywrap_tensorflow.TFE_Py_TensorShapeSlice((t1,), -2)
-
   def testNegativeSliceDim(self):
     t1 = _create_tensor([1, 2], dtype=dtypes.int32)
 
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 662cda2a7d..8cd6820f6a 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -1385,6 +1385,22 @@ def register_tensor_conversion_function(base_type,
     if not callable(conversion_func):
       raise TypeError("conversion_func must be callable.")
 
+    # context._context is checked so that we don't inadvertently create it.
+    # This is because enable_eager_execution will fail when called from the main
+    # function if the context._context is already created, and the
+    # register_tensor_conversion_function calls happen when the module is
+    # imported.
+    if context._context is not None and context.executing_eagerly(
+    ) and isinstance(base_type, six.integer_types + (
+        float,
+        np.ndarray,
+    )):
+      # TODO(nareshmodi): consider setting a context variable which disables the
+      # fastpath instead.
+      raise TypeError(
+          "Cannot register conversions for numpy arrays, python number types "
+          "when executing eagerly.")
+
     try:
       funcs_at_priority = _tensor_conversion_func_registry[priority]
     except KeyError:
-- 
GitLab


From 76ea66f24d4370e6e7848b83fc0b571ba7edfa2d Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 20 Apr 2018 11:34:55 -0700
Subject: [PATCH 235/450] Move the guts of TFE_Op into EagerOperation

PiperOrigin-RevId: 193698320
---
 tensorflow/c/eager/BUILD                      |   2 +
 tensorflow/c/eager/c_api.cc                   | 230 +++++++++---------
 tensorflow/c/eager/c_api_internal.h           |  16 +-
 tensorflow/core/common_runtime/eager/BUILD    |  16 ++
 .../common_runtime/eager/eager_operation.cc   |  33 +++
 .../common_runtime/eager/eager_operation.h    |  74 ++++++
 6 files changed, 242 insertions(+), 129 deletions(-)
 create mode 100644 tensorflow/core/common_runtime/eager/eager_operation.cc
 create mode 100644 tensorflow/core/common_runtime/eager/eager_operation.h

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 3e14c10727..d66386acbd 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -51,6 +51,7 @@ tf_cuda_library(
         ],
         "//conditions:default": [],
     }) + [
+        "//tensorflow/core/common_runtime/eager:eager_operation",
         "//tensorflow/core:gpu_runtime",
     ],
 )
@@ -73,6 +74,7 @@ tf_cuda_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core/common_runtime/eager:context",
         "//tensorflow/core/common_runtime/eager:eager_executor",
+        "//tensorflow/core/common_runtime/eager:eager_operation",
         "//tensorflow/core/common_runtime/eager:kernel_and_device",
         "//tensorflow/core/common_runtime/eager:tensor_handle",
     ],
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 369342b142..b7a3097208 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -241,21 +241,18 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
 void TFE_DeleteOp(TFE_Op* op) { delete op; }
 
 void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) {
-  tensorflow::Device* d = nullptr;
-  if (device_name != nullptr && strlen(device_name) > 0) {
-    status->status = op->ctx->context.FindDeviceByName(device_name, &d);
-  }
-  op->device = d;
+  status->status = op->operation.SetDevice(device_name);
 }
 
 const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) {
-  tensorflow::Device* device =
-      (op->device == nullptr) ? op->ctx->context.HostCPU() : op->device;
+  tensorflow::Device* device = (op->operation.Device() == nullptr)
+                                   ? op->operation.EagerContext()->HostCPU()
+                                   : op->operation.Device();
   return device->name().c_str();
 }
 
 void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
-  op->use_xla = enable;
+  op->operation.SetUseXla(enable);
 #ifndef TENSORFLOW_EAGER_USE_XLA
   LOG(WARNING) << "This call is a no-op, as the TensorFlow library is not "
                   "built with XLA support.";
@@ -263,22 +260,20 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
 }
 
 void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
-  h->handle->Ref();
-  op->inputs.push_back(h->handle);
-  op->attrs.NumInputs(op->inputs.size());
+  op->operation.AddInput(h->handle);
 }
 
 TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
                               unsigned char* is_list, TF_Status* status) {
   TF_AttrType ret;
-  if (op->is_function()) {
+  if (op->operation.is_function()) {
     status->status = tensorflow::errors::Unimplemented(
         "TODO(apassos): Support for attributes for TensorFlow functions is not "
         "ready yet.");
     return TF_ATTR_INT;  // The compiler requires that we return something.
   }
-  status->status =
-      tensorflow::AttrTypeByName(*op->attr_types, attr_name, &ret, is_list);
+  status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(),
+                                              attr_name, &ret, is_list);
   return ret;
 }
 
@@ -297,23 +292,24 @@ TF_AttrType TFE_OpNameGetAttrType(TFE_Context* ctx,
 }
 
 void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, const char* value) {
-  op->attrs.Set(attr_name, value);
+  op->operation.MutableAttrs()->Set(attr_name, value);
 }
 
 void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value) {
-  op->attrs.Set(attr_name, static_cast<int64>(value));
+  op->operation.MutableAttrs()->Set(attr_name, static_cast<int64>(value));
 }
 
 void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value) {
-  op->attrs.Set(attr_name, value);
+  op->operation.MutableAttrs()->Set(attr_name, value);
 }
 
 void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, unsigned char value) {
-  op->attrs.Set(attr_name, (value == 0) ? false : true);
+  op->operation.MutableAttrs()->Set(attr_name, (value == 0) ? false : true);
 }
 
 void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, TF_DataType value) {
-  op->attrs.Set(attr_name, static_cast<tensorflow::DataType>(value));
+  op->operation.MutableAttrs()->Set(attr_name,
+                                    static_cast<tensorflow::DataType>(value));
 }
 
 void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims,
@@ -335,23 +331,24 @@ void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims,
       proto.add_dim()->set_size(dims[d]);
     }
   }
-  op->attrs.Set(attr_name, proto);
+  op->operation.MutableAttrs()->Set(attr_name, proto);
 }
 
 void TFE_OpSetAttrFunction(TFE_Op* op, const char* attr_name,
                            const TFE_Op* value) {
   tensorflow::AttrValue attr_value;
   tensorflow::NameAttrList* func = attr_value.mutable_func();
-  func->set_name(value->name);
-  value->attrs.FillAttrValueMap(func->mutable_attr());
-  op->attrs.Set(attr_name, attr_value);
+  func->set_name(value->operation.Name());
+  value->operation.Attrs().FillAttrValueMap(func->mutable_attr());
+  op->operation.MutableAttrs()->Set(attr_name, attr_value);
 }
 
 #define TFE_OP_SET_ATTR_LIST(fn, type)                                \
   void fn(TFE_Op* op, const char* attr_name, const type* values,      \
           int num_values) {                                           \
-    op->attrs.Set(attr_name, tensorflow::gtl::ArraySlice<const type>( \
-                                 values, num_values));                \
+    op->operation.MutableAttrs()->Set(                                \
+        attr_name,                                                    \
+        tensorflow::gtl::ArraySlice<const type>(values, num_values)); \
   }
 TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrStringList, char*)
 TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float)
@@ -359,14 +356,14 @@ TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float)
 
 void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name,
                           const int64_t* values, int num_values) {
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<const int64>(
-                    reinterpret_cast<const int64*>(values), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<const int64>(
+                     reinterpret_cast<const int64*>(values), num_values));
 }
 
 void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name,
                            const TF_DataType* values, int num_values) {
-  op->attrs.Set(
+  op->operation.MutableAttrs()->Set(
       attr_name,
       tensorflow::gtl::ArraySlice<const tensorflow::DataType>(
           reinterpret_cast<const tensorflow::DataType*>(values), num_values));
@@ -378,8 +375,8 @@ void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name,
   for (int i = 0; i < num_values; ++i) {
     b[i] = values[i];
   }
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<const bool>(b.get(), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<const bool>(b.get(), num_values));
 }
 
 void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name,
@@ -409,9 +406,9 @@ void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name,
       }
     }
   }
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<tensorflow::TensorShapeProto>(
-                    proto.get(), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<tensorflow::TensorShapeProto>(
+                     proto.get(), num_values));
 }
 
 void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
@@ -419,12 +416,12 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
   std::unique_ptr<tensorflow::NameAttrList[]> funcs(
       new tensorflow::NameAttrList[num_values]);
   for (int i = 0; i < num_values; i++) {
-    funcs[i].set_name(value[i]->name);
-    value[i]->attrs.FillAttrValueMap(funcs[i].mutable_attr());
+    funcs[i].set_name(value[i]->operation.Name());
+    value[i]->operation.Attrs().FillAttrValueMap(funcs[i].mutable_attr());
   }
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<const tensorflow::NameAttrList>(
-                    funcs.get(), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<const tensorflow::NameAttrList>(
+                     funcs.get(), num_values));
 }
 }  // extern "C"
 
@@ -460,18 +457,19 @@ int StepStatsDeviceIndex(tensorflow::StepStats* step_stats,
 }
 
 tensorflow::Status ValidateInputTypeAndPlacement(
-    tensorflow::EagerContext* ctx, tensorflow::Device* op_device, TFE_Op* op,
-    const tensorflow::OpKernel* kernel, tensorflow::RunMetadata* run_metadata) {
+    tensorflow::EagerContext* ctx, tensorflow::Device* op_device,
+    tensorflow::EagerOperation* op, const tensorflow::OpKernel* kernel,
+    tensorflow::RunMetadata* run_metadata) {
   tensorflow::Device* host_device = ctx->HostCPU();
   const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types();
-  if (memtypes.size() != op->inputs.size()) {
+  if (memtypes.size() != op->Inputs().size()) {
     return tensorflow::errors::InvalidArgument(
-        "expected ", memtypes.size(), " inputs, got ", op->inputs.size());
+        "expected ", memtypes.size(), " inputs, got ", op->Inputs().size());
   }
-  for (int i = 0; i < op->inputs.size(); ++i) {
+  for (int i = 0; i < op->Inputs().size(); ++i) {
     const tensorflow::Device* expected_device =
         memtypes[i] == tensorflow::HOST_MEMORY ? host_device : op_device;
-    tensorflow::TensorHandle* handle = op->inputs[i];
+    tensorflow::TensorHandle* handle = op->Inputs()[i];
     tensorflow::Device* handle_device = nullptr;
     TF_RETURN_IF_ERROR(handle->Device(&handle_device));
     const tensorflow::Device* actual_device =
@@ -491,7 +489,7 @@ tensorflow::Status ValidateInputTypeAndPlacement(
           return tensorflow::errors::InvalidArgument(
               "Tensors on conflicting devices:"
               " cannot compute ",
-              op->name, " as input #", i, " was expected to be on ",
+              op->Name(), " as input #", i, " was expected to be on ",
               expected_device->name(), " but is actually on ",
               actual_device->name(), " (operation running on ",
               op_device->name(), ")",
@@ -502,7 +500,7 @@ tensorflow::Status ValidateInputTypeAndPlacement(
               "between devices"
               " may slow down your model");
         case tensorflow::DEVICE_PLACEMENT_WARN:
-          LOG(WARNING) << "before computing " << op->name << " input #" << i
+          LOG(WARNING) << "before computing " << op->Name() << " input #" << i
                        << " was expected to be on " << expected_device->name()
                        << " but is actually on " << actual_device->name()
                        << " (operation running on " << op_device->name()
@@ -534,16 +532,16 @@ tensorflow::Status ValidateInputTypeAndPlacement(
         if (copied_tensor != nullptr) copied_tensor->Unref();
         return tensorflow::errors::Internal(
             "Failed copying input tensor from ", actual_device->name(), " to ",
-            expected_device->name(), " in order to run ", op->name, ": ",
+            expected_device->name(), " in order to run ", op->Name(), ": ",
             status.error_message());
       }
       handle->Unref();
       handle = copied_tensor;
-      op->inputs[i] = copied_tensor;
+      (*op->MutableInputs())[i] = copied_tensor;
     }
     if (handle->dtype != kernel->input_type(i)) {
       return tensorflow::errors::InvalidArgument(
-          "cannot compute ", op->name, " as input #", i,
+          "cannot compute ", op->Name(), " as input #", i,
           " was expected to be a ",
           tensorflow::DataTypeString(kernel->input_type(i)),
           " tensor but is a ", tensorflow::DataTypeString(handle->dtype),
@@ -554,9 +552,10 @@ tensorflow::Status ValidateInputTypeAndPlacement(
 }
 
 tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef,
-                                 TFE_Context* ctx, TF_Status* status) {
+                                 tensorflow::EagerContext* ctx,
+                                 TF_Status* status) {
   tensorflow::DeviceSet ds;
-  for (tensorflow::Device* d : *ctx->context.devices()) {
+  for (tensorflow::Device* d : *ctx->devices()) {
     ds.AddDevice(d);
   }
   tensorflow::DeviceTypeVector final_devices;
@@ -570,7 +569,7 @@ tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef,
         "Could not find valid device for node ", ndef.DebugString());
     return nullptr;
   }
-  for (tensorflow::Device* d : *ctx->context.devices()) {
+  for (tensorflow::Device* d : *ctx->devices()) {
     if (d->device_type() == final_devices[0].type_string()) {
       return d;
     }
@@ -599,15 +598,16 @@ const tensorflow::FunctionDef* OpToFunction(
     std::vector<TF_DataType>* arg_input_types,
     tensorflow::gtl::FlatMap<int, int>* op_input_to_func_input,
     TF_Status* status) {
-  DCHECK(!op->is_function());
+  DCHECK(!op->operation.is_function());
 
   tensorflow::FunctionDef fdef;
 
   // Get the OpDef of the op we are trying to encapsulate.
-  TFE_Context* ctx = op->ctx;
+  TFE_Context* ctx = op->operation.ctx;
   const tensorflow::OpRegistrationData* op_data;
   {
-    status->status = ctx->context.FindFunctionOpData(op->name, &op_data);
+    status->status =
+        ctx->context.FindFunctionOpData(op->operation.Name(), &op_data);
     if (!status->status.ok()) {
       return nullptr;
     }
@@ -618,7 +618,8 @@ const tensorflow::FunctionDef* OpToFunction(
 
   // Handle constant inputs.
   const std::unordered_set<string> const_inputs(
-      *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(op->name));
+      *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(
+          op->operation.Name()));
 
   // First add place holders for the input args, so that we can refer to them by
   // position in the next loop. Also tally up the resource inputs.
@@ -644,7 +645,7 @@ const tensorflow::FunctionDef* OpToFunction(
       (*op_input_to_func_input)[i] = const_index;
       func_input_arg = signature->mutable_input_arg(const_index++);
       const_input_types->push_back(
-          static_cast<TF_DataType>(op->inputs[i]->dtype));
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
     } else if (op_input_arg.type() == tensorflow::DT_RESOURCE) {
       VLOG(1) << "For resource input, mapping op input " << i
               << " to func input " << resource_index;
@@ -656,11 +657,11 @@ const tensorflow::FunctionDef* OpToFunction(
       (*op_input_to_func_input)[i] = arg_index;
       func_input_arg = signature->mutable_input_arg(arg_index++);
       arg_input_types->push_back(
-          static_cast<TF_DataType>(op->inputs[i]->dtype));
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
     }
 
     func_input_arg->set_name(op_input_arg.name());
-    func_input_arg->set_type(op->inputs[i]->dtype);
+    func_input_arg->set_type(op->operation.Inputs()[i]->dtype);
   }
   VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
 
@@ -673,7 +674,8 @@ const tensorflow::FunctionDef* OpToFunction(
       op_def.name(), func_id_generator.fetch_add(1)));
 
   // Add the node def and set its input names to match op_def's names.
-  const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
+  const tensorflow::NodeDef& ndef =
+      op->operation.MutableAttrs()->BuildNodeDef();
   DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
   *fdef.add_node_def() = ndef;
   for (int i = 0; i < op_def.input_arg_size(); ++i) {
@@ -713,17 +715,18 @@ const tensorflow::FunctionDef* OpToFunction(
 // Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
 // via XLA.
 std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
-  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->name;
-  auto launch_op =
-      std::unique_ptr<TFE_Op>(TFE_NewOp(op->ctx, "_XlaLaunch", status));
+  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->operation.Name();
+  auto launch_op = std::unique_ptr<TFE_Op>(
+      TFE_NewOp(op->operation.ctx, "_XlaLaunch", status));
   if (TF_GetCode(status) != TF_OK) return nullptr;
-  if (op->device) {
-    TFE_OpSetDevice(launch_op.get(), op->device->name().c_str(), status);
+  if (op->operation.device) {
+    TFE_OpSetDevice(launch_op.get(), op->operation.device->name().c_str(),
+                    status);
     if (TF_GetCode(status) != TF_OK) return nullptr;
   }
 
   const tensorflow::FunctionDef* fdef;
-  { fdef = op->ctx->context.FindFunctionDef(op->name); }
+  fdef = op->operation.EagerContext()->FindFunctionDef(op->operation.Name());
   std::vector<TF_DataType> const_input_types;
   std::vector<TF_DataType> arg_input_types;
   tensorflow::gtl::FlatMap<int, int> op_input_to_func_input;
@@ -748,20 +751,21 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
   // Copy inputs and their devices.
   // Since input param reordering may have occurred between `op` and `launch_op`
   // via `op_input_to_func_input`, adjust the actual inputs accordingly.
-  launch_op->inputs = op->inputs;
-  for (tensorflow::TensorHandle* h : launch_op->inputs) {
+  *launch_op->operation.MutableInputs() = op->operation.Inputs();
+  for (tensorflow::TensorHandle* h : launch_op->operation.Inputs()) {
     h->Ref();
   }
   if (!op_input_to_func_input.empty()) {
-    DCHECK_EQ(op->inputs.size(), op_input_to_func_input.size());
+    DCHECK_EQ(op->operation.Inputs().size(), op_input_to_func_input.size());
     for (int i = 0; i < op_input_to_func_input.size(); ++i) {
       VLOG(1) << "mapping op input " << i << " to func input "
               << op_input_to_func_input[i];
 
-      launch_op->inputs[op_input_to_func_input[i]] = op->inputs[i];
+      (*launch_op->operation.MutableInputs())[op_input_to_func_input[i]] =
+          op->operation.Inputs()[i];
     }
   }
-  launch_op->attrs.NumInputs(op->inputs.size());
+  launch_op->operation.MutableAttrs()->NumInputs(op->operation.Inputs().size());
 
   TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
                         const_input_types.size());
@@ -796,16 +800,17 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
 
 extern "C" {
 
-void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
+void TFE_Execute(TFE_Op* tfe_op, TFE_TensorHandle** retvals, int* num_retvals,
                  TF_Status* status) {
-  TFE_Context* ctx = op->ctx;
-  status->status = ctx->context.GetStatus();
+  tensorflow::EagerOperation* op = &tfe_op->operation;
+  tensorflow::EagerContext* ctx = op->EagerContext();
+  status->status = ctx->GetStatus();
   if (!status->status.ok()) {
     return;
   }
 #ifdef TENSORFLOW_EAGER_USE_XLA
   std::unique_ptr<TFE_Op> xla_launch_op;
-  if (op->use_xla && op->name != "_XlaLaunch") {
+  if (op->UseXla() && op->Name() != "_XlaLaunch") {
     xla_launch_op = BuildXlaLaunch(op, status);
     if (!status->status.ok()) {
       return;
@@ -816,31 +821,31 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
   // Ensure all resource-touching ops run in the device the resource is,
   // regardless of anything else that has been specified. This is identical to
   // the graph mode behavior.
-  for (int i = 0; i < op->inputs.size(); ++i) {
+  for (int i = 0; i < op->Inputs().size(); ++i) {
     tensorflow::Device* input_op_device = nullptr;
-    status->status = op->inputs[i]->OpDevice(&input_op_device);
+    status->status = op->Inputs()[i]->OpDevice(&input_op_device);
     if (!status->status.ok()) return;
-    VLOG(2) << "for op " << op->name << " input " << i << " "
-            << tensorflow::DataTypeString(op->inputs[i]->dtype) << " "
+    VLOG(2) << "for op " << op->Name() << " input " << i << " "
+            << tensorflow::DataTypeString(op->Inputs()[i]->dtype) << " "
             << (input_op_device == nullptr ? "cpu" : input_op_device->name())
-            << " " << (op->device == nullptr ? "cpu" : op->device->name());
-    if (op->inputs[i]->dtype == tensorflow::DT_RESOURCE &&
-        (input_op_device != op->device || input_op_device == nullptr)) {
+            << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name());
+    if (op->Inputs()[i]->dtype == tensorflow::DT_RESOURCE &&
+        (input_op_device != op->Device() || input_op_device == nullptr)) {
       tensorflow::Device* d =
-          input_op_device == nullptr ? ctx->context.HostCPU() : input_op_device;
-      VLOG(1) << "Changing device of operation " << op->name << " to "
+          input_op_device == nullptr ? ctx->HostCPU() : input_op_device;
+      VLOG(1) << "Changing device of operation " << op->Name() << " to "
               << d->name() << " because input #" << i
               << " is a resource in this device.";
-      op->device = d;
+      op->SetDevice(d);
     }
   }
-  tensorflow::Device* device = op->device;
+  tensorflow::Device* device = op->Device();
 
-  tensorflow::Fprint128 cache_key =
-      op->attrs.CacheKey(device == nullptr ? "unspecified" : device->name());
-  tensorflow::KernelAndDevice* kernel = ctx->context.GetCachedKernel(cache_key);
+  tensorflow::Fprint128 cache_key = op->MutableAttrs()->CacheKey(
+      device == nullptr ? "unspecified" : device->name());
+  tensorflow::KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key);
   if (kernel == nullptr) {
-    const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
+    const tensorflow::NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
     if (device == nullptr) {
       device = SelectDevice(ndef, ctx, status);
       if (!status->status.ok()) {
@@ -848,19 +853,19 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
       }
     }
     CHECK(device != nullptr);
-    if (ctx->context.LogDevicePlacement()) {
+    if (ctx->LogDevicePlacement()) {
       LOG(INFO) << "Executing op " << ndef.op() << " in device "
                 << device->name();
     }
-    kernel = new tensorflow::KernelAndDevice(ctx->context.GetRendezvous());
+    kernel = new tensorflow::KernelAndDevice(ctx->GetRendezvous());
     // Knowledge of the implementation of Init (and in-turn
     // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def
     // will be accessed, so grab on to the lock.
     // See WARNING comment in Execute (before kernel->Run) - would be nice to
     // rework to avoid this subtlety.
-    tensorflow::tf_shared_lock l(*ctx->context.FunctionsMu());
-    status->status = tensorflow::KernelAndDevice::Init(
-        ndef, ctx->context.func_lib(device), kernel);
+    tensorflow::tf_shared_lock l(*ctx->FunctionsMu());
+    status->status =
+        tensorflow::KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel);
     if (!status->status.ok()) {
       delete kernel;
       return;
@@ -868,7 +873,7 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     // Update output_dtypes inside `kernel`.
     const tensorflow::OpDef* op_def = nullptr;
     const tensorflow::FunctionDef* function_def =
-        ctx->context.FuncLibDef()->Find(ndef.op());
+        ctx->FuncLibDef()->Find(ndef.op());
     if (function_def != nullptr) {
       op_def = &(function_def->signature());
     }
@@ -884,7 +889,7 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     if (!status->status.ok()) {
       return;
     }
-    ctx->context.AddKernelToCache(cache_key, kernel);
+    ctx->AddKernelToCache(cache_key, kernel);
   }
   const tensorflow::DataTypeVector& output_dtypes = kernel->output_dtypes();
   const int output_dtypes_size = output_dtypes.size();
@@ -903,43 +908,42 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     device = kernel->device();
   }
   status->status = ValidateInputTypeAndPlacement(
-      &ctx->context, device, op, kernel->kernel(),
-      ctx->context.ShouldStoreMetadata() ? ctx->context.RunMetadataProto()
-                                         : nullptr);
+      ctx, device, op, kernel->kernel(),
+      ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
   if (!status->status.ok()) return;
   std::unique_ptr<tensorflow::NodeExecStats> maybe_stats;
-  if (ctx->context.ShouldStoreMetadata()) {
+  if (ctx->ShouldStoreMetadata()) {
     maybe_stats.reset(new tensorflow::NodeExecStats);
-    maybe_stats->set_node_name(op->name);
+    maybe_stats->set_node_name(op->Name());
     maybe_stats->set_all_start_micros(tensorflow::Env::Default()->NowMicros());
     maybe_stats->set_op_start_rel_micros(0);
     maybe_stats->set_scheduled_micros(tensorflow::Env::Default()->NowMicros());
     // TODO(apassos) track referenced tensors
   }
-  if (ctx->context.Async()) {
+  if (ctx->Async()) {
     // Note that for async mode, execution order will make sure that all
     // input handles are ready before executing them.
     // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
     tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
         *num_retvals);
-    tensorflow::uint64 id = op->ctx->context.NextId();
+    tensorflow::uint64 id = ctx->NextId();
     for (int i = 0; i < *num_retvals; ++i) {
       tensorflow::TensorHandle* h =
-          new tensorflow::TensorHandle(id, output_dtypes[i], &op->ctx->context);
+          new tensorflow::TensorHandle(id, output_dtypes[i], ctx);
       retvals[i] = new TFE_TensorHandle(h);
       handle_retvals[i] = h;
     }
     tensorflow::EagerNode* node = new tensorflow::ExecuteNode(
-        id, &op->ctx->context, op->device, op->inputs, kernel,
-        maybe_stats.release(), output_dtypes, handle_retvals);
-    ctx->context.ExecutorAdd(node);
+        id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(),
+        output_dtypes, handle_retvals);
+    ctx->ExecutorAdd(node);
   } else {
     // Execute checks if retvals[i] is nullptr or not to figure if it needs to
     // allocate it.
     tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
         *num_retvals);
     status->status = tensorflow::EagerExecute(
-        &op->ctx->context, op->device, op->inputs, kernel, maybe_stats.get(),
+        ctx, op->Device(), op->Inputs(), kernel, maybe_stats.get(),
         handle_retvals.data(), *num_retvals);
     for (int i = 0; i < *num_retvals; ++i) {
       retvals[i] = new TFE_TensorHandle(handle_retvals[i]);
@@ -1142,9 +1146,3 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
   }
 }
 }  // namespace tensorflow
-
-TFE_Op::~TFE_Op() {
-  for (tensorflow::TensorHandle* h : inputs) {
-    h->Unref();
-  }
-}
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 05dc64f521..49e1aab1ce 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -45,7 +46,6 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/public/version.h"
 
-
 struct TFE_ContextOptions {
   TF_SessionOptions session_options;
   // true if async execution is enabled.
@@ -85,19 +85,9 @@ struct TFE_Op {
   // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a
   // primitive operation.
   TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t)
-      : ctx(ctx), name(op), attrs(op), attr_types(t), device(nullptr) {}
-
-  ~TFE_Op();
-
-  bool const is_function() const { return attr_types == nullptr; }
+      : operation(&ctx->context, op, t) {}
 
-  TFE_Context* ctx;  // Must outlive the TFE_Op.
-  const tensorflow::string name;
-  tensorflow::AttrBuilder attrs;
-  const tensorflow::AttrTypeMap* attr_types;
-  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs;
-  tensorflow::Device* device;
-  bool use_xla = false;
+  tensorflow::EagerOperation operation;
 };
 
 namespace tensorflow {
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 941a0e61c7..00ac4a4e47 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -54,6 +54,22 @@ tf_cuda_library(
     ],
 )
 
+tf_cuda_library(
+    name = "eager_operation",
+    srcs = [
+        "eager_operation.cc",
+    ],
+    hdrs = [
+        "eager_operation.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        ":tensor_handle",
+        "//tensorflow/c/eager:runtime",
+    ],
+)
+
 tf_cuda_library(
     name = "tensor_handle",
     srcs = [
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc
new file mode 100644
index 0000000000..381b05ada8
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_operation.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
+
+namespace tensorflow {
+tensorflow::Status EagerOperation::SetDevice(const char* device) {
+  auto status = Status::OK();
+  tensorflow::Device* d = nullptr;
+  if (device != nullptr && device[0] != '\0') {
+    status.Update(ctx_->FindDeviceByName(device, &d));
+  }
+  device_ = d;
+  return status;
+}
+
+void EagerOperation::AddInput(tensorflow::TensorHandle* h) {
+  h->Ref();
+  inputs_.push_back(h);
+  attrs_.NumInputs(static_cast<int>(inputs_.size()));
+}
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
new file mode 100644
index 0000000000..6b6e53da87
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
+
+#include "tensorflow/c/eager/runtime.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+
+namespace tensorflow {
+class EagerOperation {
+ public:
+  // t is NULL iff the EagerOperation corresponds to a TensorFlow function
+  // instead of a primitive operation.
+  EagerOperation(tensorflow::EagerContext* ctx, const char* op,
+                 const tensorflow::AttrTypeMap* t)
+      : ctx_(ctx), name_(op), attrs_(op), attr_types_(t), device_(nullptr) {}
+
+  ~EagerOperation() {
+    for (tensorflow::TensorHandle* h : inputs_) {
+      h->Unref();
+    }
+  }
+
+  bool is_function() const { return attr_types_ == nullptr; }
+
+  tensorflow::EagerContext* EagerContext() { return ctx_; }
+
+  tensorflow::AttrBuilder* MutableAttrs() { return &attrs_; }
+  const tensorflow::AttrBuilder& Attrs() const { return attrs_; }
+
+  const tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4>& Inputs()
+      const {
+    return inputs_;
+  }
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4>*
+  MutableInputs() {
+    return &inputs_;
+  }
+  void AddInput(tensorflow::TensorHandle* h);
+
+  const tensorflow::string& Name() const { return name_; }
+  const tensorflow::AttrTypeMap* AttrTypes() const { return attr_types_; }
+
+  tensorflow::Device* Device() const { return device_; }
+  tensorflow::Status SetDevice(const char* device);
+  void SetDevice(tensorflow::Device* device) { device_ = device; }
+
+  bool UseXla() const { return use_xla_; }
+  void SetUseXla(bool use_xla) { use_xla_ = use_xla; }
+ private:
+  tensorflow::EagerContext* ctx_;  // Must outlive the EagerOperation.
+  const tensorflow::string name_;
+  tensorflow::AttrBuilder attrs_;
+  const tensorflow::AttrTypeMap* attr_types_;
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs_;
+  tensorflow::Device* device_;
+  bool use_xla_ = false;
+};
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
-- 
GitLab


From 2b0b015ebb1c33a409836bd1c9c98124dfd841ec Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 11:43:48 -0700
Subject: [PATCH 236/450] [XLA] Fix a bug in ToProto: don't add gather
 attributes twice.

PiperOrigin-RevId: 193699745
---
 tensorflow/compiler/xla/service/hlo_instruction.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index a638d54d85..a714d0e114 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -2451,12 +2451,6 @@ HloInstructionProto HloInstruction::ToProto() const {
     proto.add_fft_length(fft_len);
   }
 
-  if (gather_dimension_numbers_ != nullptr) {
-    *proto.mutable_gather_dimension_numbers() = *gather_dimension_numbers_;
-  }
-  for (int64 bound : gather_window_bounds_) {
-    proto.add_gather_window_bounds(bound);
-  }
   proto.set_channel_name(channel_name_);
   proto.set_cost_estimate_ns(cost_estimate_ns_);
 
-- 
GitLab


From 0074dffd076e0faf4da5913aebfa594ef925d6c7 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 20 Apr 2018 12:01:21 -0700
Subject: [PATCH 237/450] Prefix compat import with underscore in
 meta_graph_transform.py so that it doesn't get exported as part of API:
 https://www.tensorflow.org/versions/r1.8/api_docs/python/tf/contrib/meta_graph_transform/meta_graph_transform

PiperOrigin-RevId: 193702570
---
 .../meta_graph_transform/meta_graph_transform.py       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
index ff88b4fa84..4090c1ff3e 100644
--- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
+++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
@@ -30,7 +30,7 @@ from tensorflow.python.framework import importer as _importer
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.saved_model import constants as _saved_model_constants
 from tensorflow.python.training import saver as _saver_lib
-from tensorflow.python.util import compat
+from tensorflow.python.util import compat as _compat
 from tensorflow.tools import graph_transforms as _graph_transforms
 
 
@@ -161,7 +161,7 @@ def _clean_save_and_restore(graph_def, op, removed_op_names):
   shapes = []
   dtypes = []
   for index, value in enumerate(name_op_value_tensor.string_val):
-    if not _is_removed(compat.as_str(value), removed_op_names):
+    if not _is_removed(_compat.as_str(value), removed_op_names):
       names.append(value)
       shapes.append(shape_op_value_tensor.string_val[index])
       dtypes.append(op.attr['dtypes'].list.type[index])
@@ -651,7 +651,7 @@ def _is_removed_mentioned(s, removed_op_names):
   # /foo/bar. This regex ensures that we handle these two nodes
   # as separate entities.  It matches on nodes having names in the form of
   # '/foo/bar_x' as well as nodes having names in the form of 'foo.'
-  s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', compat.as_str_any(s))
+  s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', _compat.as_str_any(s))
   for removed_op_name in removed_op_names:
     for s_name in s_names:
       if s_name.endswith(removed_op_name):
@@ -737,9 +737,9 @@ def meta_graph_transform(
   for tag in tags:
     meta_graph_def.meta_info_def.tags.append(tag)
 
-  base_op_names = [compat.as_str(node.name)
+  base_op_names = [_compat.as_str(node.name)
                    for node in base_meta_graph_def.graph_def.node]
-  retained_op_names = [compat.as_str(node.name)
+  retained_op_names = [_compat.as_str(node.name)
                        for node in meta_graph_def.graph_def.node]
   removed_op_names = set(base_op_names) - set(retained_op_names)
 
-- 
GitLab


From 1b5839e6acad5d360ea9e5b94226b30047924cb9 Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Fri, 20 Apr 2018 12:02:56 -0700
Subject: [PATCH 238/450] [TF:XLA] Now that the compiler no longer introduces
 implicit broadcasts, forbid them in the HLO verifier.

PiperOrigin-RevId: 193702874
---
 tensorflow/compiler/xla/service/BUILD         |  1 +
 .../compiler/xla/service/hlo_verifier.cc      | 21 ++++++++
 .../compiler/xla/service/hlo_verifier.h       |  4 ++
 .../xla/service/reshape_mover_test.cc         | 51 -------------------
 4 files changed, 26 insertions(+), 51 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 9009cbf845..9555d91817 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2032,6 +2032,7 @@ cc_library(
     srcs = ["hlo_verifier.cc"],
     hdrs = ["hlo_verifier.h"],
     deps = [
+        ":hlo",
         ":hlo_pass",
         ":shape_inference",
         "//tensorflow/compiler/xla:status_macros",
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 80ed6d6832..8a30cbf9cd 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <set>
 
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -780,6 +781,24 @@ Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) {
   return tensorflow::Status::OK();
 }
 
+Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) {
+  const Shape& out_shape = instruction->shape();
+  for (HloInstruction* operand : instruction->operands()) {
+    const Shape& operand_shape = operand->shape();
+    if (!ShapeUtil::IsScalar(operand_shape) &&
+        !ShapeUtil::CompatibleIgnoringElementType(operand_shape, out_shape)) {
+      return FailedPrecondition(
+          "Implicit broadcast is not allowed in HLO. "
+          "Found non-compatible shapes for instruction %s.\n"
+          "output: %s\noperand: %s\n",
+          HloOpcodeString(instruction->opcode()).c_str(),
+          ShapeUtil::HumanString(out_shape).c_str(),
+          ShapeUtil::HumanString(operand_shape).c_str());
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
 StatusOr<bool> HloVerifier::Run(HloModule* module) {
   TF_RETURN_IF_ERROR(VerifyHloStructure(module));
 
@@ -821,6 +840,8 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
             << " != " << ShapeUtil::Rank(instruction->operand(0)->shape());
       } else if (instruction->opcode() == HloOpcode::kWhile) {
         TF_RETURN_IF_ERROR(CheckWhileInstruction(instruction));
+      } else if (instruction->IsElementwise()) {
+        TF_RETURN_IF_ERROR(CheckElementwiseInstruction(instruction));
       }
 
       auto previous = instructions.find(instruction->name());
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 1ec55a9bdc..6208887547 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -146,6 +146,10 @@ class HloVerifier : public HloPassInterface {
 
   Status CheckWhileInstruction(HloInstruction* instruction);
 
+  // Checks that the non-scalar operand shapes are compatible with the output
+  // shape, i.e., that there are no implicit broadcasts of size-one dimensions.
+  Status CheckElementwiseInstruction(HloInstruction* instruction);
+
   // Creates a ShapeVerifier that checks that shapes match inferred
   // expectations. This is a factory function because ShapeVerifier,
   // being a DfsHloVisitor, is stateful. We want a clean object
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index 094f7319f4..13e2d3258e 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -458,57 +458,6 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
   EXPECT_EQ(select, computation->root_instruction());
 }
 
-// Tree looks like:
-//
-// param0 [1,128,1]
-//  |
-// reshape [128,1]          constant [128,1024]
-//   \                         /
-//     multiply w/implicit broadcast [128,1024]
-//
-// The reshape mover would like to sink the reshape below the multiply.
-//
-// Previously we would attempt to insert a reshape of the constant to [1,128,1]
-// (which is unsound, because it has a different number of elements) as
-// preparation for sinking the reshape.
-//
-// To eliminate the unsoundness, we outlaw reshape sinking when one of the
-// operands is implicitly broadcast in the elementwise consumer.
-//
-// TODO(b/37799338) However, it would be possible in this case to do a more
-// in-depth analysis to get reshape movement to occur:
-//
-// 1. Note that the broadcast dimension (logical dimension 1) in the operands
-//    would map back to logical dimension 2 in the param0 node.
-// 2. Match rank of the constant to the param0 node (by prepending a trivial 1
-//    dimension).
-// 3. Reshape to [128,1024] at the root.
-//
-// But this is not currently done.
-TEST_F(ReshapeMoverTest, ImplicitlyBroadcastReshapeIsNotMovedBug37787999) {
-  HloComputation::Builder builder(TestName());
-  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {1, 128, 1}), "param0"));
-  auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
-      ShapeUtil::MakeShape(F32, {128, 1}), param0));
-  Array2D<float> a(128, 1024);
-  auto literal = Literal::CreateR2FromArray2D<float>(a);
-  auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(std::move(literal)));
-  auto multiply = builder.AddInstruction(HloInstruction::CreateBinary(
-      constant->shape(), HloOpcode::kMultiply, constant, reshape));
-
-  auto computation = module().AddEntryComputation(builder.Build());
-  EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Constant(), op::Reshape(param0)));
-
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
-
-  EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Constant(), op::Reshape(param0)));
-  EXPECT_EQ(multiply, computation->root_instruction());
-}
-
 // Tree looks like this:
 //
 // add1
-- 
GitLab


From ceed923d600584ade8d159271422b4a08f728cbb Mon Sep 17 00:00:00 2001
From: Yangzihao Wang <yangzihao@google.com>
Date: Fri, 20 Apr 2018 12:05:11 -0700
Subject: [PATCH 239/450] Add native dilated support for conv3d and its
 gradients in cudnn v>=6.

PiperOrigin-RevId: 193703316
---
 tensorflow/core/framework/common_shape_fns.cc |  32 ++-
 .../core/framework/common_shape_fns_test.cc   |  55 ++++-
 tensorflow/core/kernels/conv_grad_ops_3d.cc   | 115 +++++++++-
 tensorflow/core/kernels/conv_ops_3d.cc        |  52 ++++-
 tensorflow/core/ops/nn_ops.cc                 |   2 +
 .../python/kernel_tests/conv_ops_3d_test.py   | 196 +++++++++++++++++-
 tensorflow/python/ops/nn_grad.py              |   6 +
 7 files changed, 426 insertions(+), 32 deletions(-)

diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 72eeda7a43..0916c9b7a8 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -487,6 +487,15 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
   string data_format;
   Status s = c->GetAttr("data_format", &data_format);
 
+  std::vector<int32> dilations;
+  TF_RETURN_IF_ERROR(c->GetAttr("dilations", &dilations));
+
+  if (dilations.size() != 5) {
+    return errors::InvalidArgument(
+        "Conv3D requires the dilation attribute to contain 5 values, but got: ",
+        dilations.size());
+  }
+
   std::vector<int32> strides;
   TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
   if (strides.size() != 5) {
@@ -496,6 +505,7 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
   }
 
   int32 stride_planes, stride_rows, stride_cols;
+  int32 dilation_planes, dilation_rows, dilation_cols;
   if (s.ok() && data_format == "NCDHW") {
     // Convert input_shape to NDHWC.
     auto dim = [&](char dimension) {
@@ -506,10 +516,16 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
     stride_planes = strides[2];
     stride_rows = strides[3];
     stride_cols = strides[4];
+    dilation_planes = dilations[2];  // Axis order mirrors the strides.
+    dilation_rows = dilations[3];
+    dilation_cols = dilations[4];
   } else {
     stride_planes = strides[1];
     stride_rows = strides[2];
     stride_cols = strides[3];
+    dilation_planes = dilations[1];  // Axis order mirrors the strides.
+    dilation_rows = dilations[2];
+    dilation_cols = dilations[3];
   }
 
   DimensionHandle batch_size_dim = c->Dim(input_shape, 0);
@@ -530,13 +546,15 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
   DimensionHandle output_planes, output_rows, output_cols;
 
-  TF_RETURN_IF_ERROR(
-      GetWindowedOutputSizeFromDims(c, in_planes_dim, filter_planes_dim,
-                                    stride_planes, padding, &output_planes));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(
-      c, in_rows_dim, filter_rows_dim, stride_rows, padding, &output_rows));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(
-      c, in_cols_dim, filter_cols_dim, stride_cols, padding, &output_cols));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, in_planes_dim, filter_planes_dim, dilation_planes, stride_planes,
+      padding, &output_planes));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, in_rows_dim, filter_rows_dim, dilation_rows, stride_rows, padding,
+      &output_rows));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, in_cols_dim, filter_cols_dim, dilation_cols, stride_cols, padding,
+      &output_cols));
 
   ShapeHandle output_shape;
   if (data_format == "NCDHW") {
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index 13d429b895..919e0967c0 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -644,15 +644,19 @@ TEST(CommonShapeFnsTest, Conv3DShapeTest) {
                     .Finalize(&op.node_def));
   };
 
-  // 1x1x1 filter
-  set_op({{1, 1, 1, 1, 1}}, "VALID");
-  INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
-
   // Invalid rank for input
   INFER_ERROR("must be rank 5", op, "[4,4];[2,1,1,1]");
   // Invalid rank for filter
   INFER_ERROR("must be rank 5", op, "[1,4,4,1];[2,1,1]");
 
+  // Invalid value for strides
+  set_op({{1, 1, 1, 0, 1}}, "VALID");
+  INFER_ERROR("must be > 0", op, "[1,2,2,2,1];[1,1,1,1,1]");
+
+  // 1x1x1 filter
+  set_op({{1, 1, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
+
   // unknown dims in the critical fields give partial inference.
   INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
   INFER_OK(op, "[1,?,2,2,1];[1,1,1,1,1]", "[d0_0,?,2,2,d1_4]");
@@ -712,6 +716,49 @@ TEST(CommonShapeFnsTest, Conv3DShapeTest) {
   INFER_OK(op, "[1,4,9,4,1];[2,2,2,1,?]", "[d0_0,2,3,1,d1_4]");
 }
 
+TEST(CommonShapeFnsTest, Conv3DDilatedShapeTest) {
+  ShapeInferenceTestOp op("Conv3D");
+  auto set_op = [&op](const std::vector<int32>& dilations,
+                      const std::vector<int32>& strides,
+                      const string& padding) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Conv3D")
+                    .Input("input", 0, DT_FLOAT)
+                    .Input("filter", 0, DT_FLOAT)
+                    .Attr("dilations", dilations)
+                    .Attr("strides", strides)
+                    .Attr("padding", padding)
+                    .Finalize(&op.node_def));
+  };
+
+  // Invalid rank for dilation
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_ERROR("contain 5 values", op, "[1,2,2,2,1];[1,1,1,1,1]");
+
+  // Invalid value for dilation
+  set_op({{1, 2, 0, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_ERROR("must be >= 1", op, "[1,2,2,2,1];[1,1,1,1,1]");
+
+  // 2x1x1 dilation 1x1x1 filter
+  set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
+
+  // 2x1x1 dilation 2x2x2 filter
+  set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,3,2,2,1];[2,2,2,1,1]", "[d0_0,1,1,1,d1_4]");
+
+  // 2x1x1 dilation 3x3x3 input, 1x1x1 filter, 2x2x2 stride
+  set_op({{1, 2, 1, 1, 1}}, {{1, 2, 2, 2, 1}}, "VALID");
+  INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
+
+  // 2x1x1 dilation 3x3x3 input, 1x1x1 filter, 2x1x1 stride
+  set_op({{1, 2, 1, 1, 1}}, {{1, 2, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,3,3,d1_4]");
+
+  // 2x1x1 dilation 4x4x4 input, 2x2x2 filter, 1x1x1 stride
+  set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "SAME");
+  INFER_OK(op, "[1,4,4,4,1];[2,2,2,1,1]", "[d0_0,d0_1,d0_2,d0_3,d1_4]");
+}
+
 TEST(CommonShapeFnsTest, DepthwiseConv2DShapeTest) {
   ShapeInferenceTestOp op("DepthwiseConv2dNative");
   std::vector<int32> strides = {{1, 1, 1, 1}};
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 1234997bc5..092e859a5b 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -79,13 +79,18 @@ typedef Eigen::GpuDevice GPUDevice;
       context, out_depth == GetTensorDim(out_backprop, data_format_, 'C'),     \
       errors::InvalidArgument(                                                 \
           label, ": filter and out_backprop must have the same out_depth"));   \
+  const std::array<int64, 3> dilations = {                                     \
+      {GetTensorDim(dilation_, data_format_, '0'),                             \
+       GetTensorDim(dilation_, data_format_, '1'),                             \
+       GetTensorDim(dilation_, data_format_, '2')}};                           \
   const std::array<int64, 3> strides = {                                       \
       {GetTensorDim(stride_, data_format_, '0'),                               \
        GetTensorDim(stride_, data_format_, '1'),                               \
        GetTensorDim(stride_, data_format_, '2')}};                             \
   std::array<int64, 3> out, padding;                                           \
-  OP_REQUIRES_OK(context, Get3dOutputSize(input_size, filter_size, strides,    \
-                                          padding_, &out, &padding));          \
+  OP_REQUIRES_OK(                                                              \
+      context, Get3dOutputSizeV2(input_size, filter_size, dilations, strides,  \
+                                 padding_, &out, &padding));                   \
   OP_REQUIRES(context, output_planes == out[0],                                \
               errors::InvalidArgument(                                         \
                   label,                                                       \
@@ -151,6 +156,26 @@ class Conv3DBackpropInputOp : public OpKernel {
               "Conv3DBackpropInputOpV2 only supports NDHWC on the CPU."));
     }
 
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+
+    // TODO(yangzihao): Add CPU version of dilated conv 3D.
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, '0') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '1') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '2') == 1),
+                errors::InvalidArgument(
+                    "Current CPU implementation does not yet support "
+                    "dilation rates larger than 1."));
+
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -223,6 +248,7 @@ class Conv3DBackpropInputOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -261,6 +287,26 @@ class Conv3DBackpropFilterOp : public OpKernel {
               "Conv3DBackpropFilterOpV2 only supports NDHWC on the CPU."));
     }
 
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+
+    // TODO(yangzihao): Add CPU version of dilated conv 3D.
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, '0') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '1') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '2') == 1),
+                errors::InvalidArgument(
+                    "Current CPU implementation does not yet support "
+                    "dilation rates larger than 1."));
+
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -370,6 +416,7 @@ class Conv3DBackpropFilterOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -438,6 +485,22 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
       OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                   errors::InvalidArgument("Invalid data format"));
     }
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(dilation_, data_format_, '0') > 0 &&
+         GetTensorDim(dilation_, data_format_, '1') > 0 &&
+         GetTensorDim(dilation_, data_format_, '2') > 0),
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -448,6 +511,12 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
          GetTensorDim(stride_, data_format_, 'N') == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(stride_, data_format_, '0') > 0 &&
+         GetTensorDim(stride_, data_format_, '1') > 0 &&
+         GetTensorDim(stride_, data_format_, '2') > 0),
+        errors::InvalidArgument("Spatial strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     cudnn_use_autotune_ = CudnnUseAutotune();
   }
@@ -471,6 +540,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
 
     if (filter_size[0] == 1 && filter_size[1] == 1 && filter_size[2] == 1 &&
+        dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1 &&
         stride_[0] == 1 && stride_[1] == 1 && stride_[2] == 1 &&
         data_format_ == FORMAT_NHWC) {
       const uint64 m = batch * input_size[0] * input_size[1] * input_size[2];
@@ -580,7 +650,10 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
     perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
-    conv_desc.set_filter_stride(DimIndex::X, strides[2])
+    conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
+        .set_dilation_rate(DimIndex::Y, dilations[1])
+        .set_dilation_rate(DimIndex::Z, dilations[0])
+        .set_filter_stride(DimIndex::X, strides[2])
         .set_filter_stride(DimIndex::Y, strides[1])
         .set_filter_stride(DimIndex::Z, strides[0])
         .set_zero_padding(DimIndex::X, padding_cols / 2)
@@ -645,9 +718,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
-        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
-        // conv is supported.
-        /*dilation=*/{{1, 1, 1}},
+        {{dilations[0], dilations[1], dilations[2]}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
@@ -755,6 +826,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -784,6 +856,22 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
       OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                   errors::InvalidArgument("Invalid data format"));
     }
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(dilation_, data_format_, '0') > 0 &&
+         GetTensorDim(dilation_, data_format_, '1') > 0 &&
+         GetTensorDim(dilation_, data_format_, '2') > 0),
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -794,6 +882,12 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
          GetTensorDim(stride_, data_format_, 'N') == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(stride_, data_format_, '0') > 0 &&
+         GetTensorDim(stride_, data_format_, '1') > 0 &&
+         GetTensorDim(stride_, data_format_, '2') > 0),
+        errors::InvalidArgument("Spatial strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     cudnn_use_autotune_ = CudnnUseAutotune();
   }
@@ -820,6 +914,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
 
     if (filter_size[1] == 1 && filter_size[2] == 1 && filter_size[0] == 1 &&
+        dilations[2] == 1 && dilations[1] == 1 && dilations[0] == 1 &&
         strides[2] == 1 && strides[1] == 1 && strides[0] == 1 &&
         data_format_ == FORMAT_NHWC) {
       const uint64 m = in_depth;
@@ -943,7 +1038,10 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
     perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
-    conv_desc.set_filter_stride(DimIndex::X, strides[2])
+    conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
+        .set_dilation_rate(DimIndex::Y, dilations[1])
+        .set_dilation_rate(DimIndex::Z, dilations[0])
+        .set_filter_stride(DimIndex::X, strides[2])
         .set_filter_stride(DimIndex::Y, strides[1])
         .set_filter_stride(DimIndex::Z, strides[0])
         .set_zero_padding(DimIndex::X, padding_cols / 2)
@@ -1016,7 +1114,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
-        {{1, 1, 1}},
+        {{dilations[0], dilations[1], dilations[2]}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
@@ -1102,6 +1200,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 0b7c1524e6..48dd3c9eb0 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -49,12 +49,18 @@ template <typename T>
 struct LaunchConvOp<CPUDevice, T> {
   static void launch(OpKernelContext* context, bool cudnn_use_autotune,
                      const Tensor& input, const Tensor& filter,
+                     const std::array<int64, 3>& dilations,
                      const std::array<int64, 3>& strides, const Padding padding,
                      TensorFormat data_format, Tensor* output) {
     OP_REQUIRES(context, data_format == FORMAT_NHWC,
                 errors::InvalidArgument("CPU implementation of Conv3D "
                                         "currently only supports the NHWC "
                                         "tensor format."));
+    OP_REQUIRES(context,
+                dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1,
+                errors::InvalidArgument("CPU implementation of Conv3D "
+                                        "currently only supports dilated rates "
+                                        "of 1."));
     functor::CuboidConvolution<CPUDevice, T>()(
         context->eigen_device<CPUDevice>(), output->tensor<T, 5>(),
         input.tensor<T, 5>(), filter.tensor<T, 5>(), strides[2], strides[1],
@@ -80,6 +86,28 @@ class Conv3DOp : public BinaryOp<T> {
          GetTensorDim(stride_, data_format_, 'C') == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(stride_, data_format_, '0') > 0 &&
+         GetTensorDim(stride_, data_format_, '1') > 0 &&
+         GetTensorDim(stride_, data_format_, '2') > 0),
+        errors::InvalidArgument("Spatial strides should be larger than 0."));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'N') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'C') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(dilation_, data_format_, '0') > 0 &&
+         GetTensorDim(dilation_, data_format_, '1') > 0 &&
+         GetTensorDim(dilation_, data_format_, '2') > 0),
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     cudnn_use_autotune_ = CudnnUseAutotune();
   }
@@ -115,13 +143,18 @@ class Conv3DOp : public BinaryOp<T> {
          GetTensorDim(input, data_format_, '2')}};
     std::array<int64, 3> filter_size = {
         {filter.dim_size(0), filter.dim_size(1), filter.dim_size(2)}};
+    std::array<int64, 3> dilations = {
+        {GetTensorDim(dilation_, data_format_, '0'),
+         GetTensorDim(dilation_, data_format_, '1'),
+         GetTensorDim(dilation_, data_format_, '2')}};
     std::array<int64, 3> strides = {{GetTensorDim(stride_, data_format_, '0'),
                                      GetTensorDim(stride_, data_format_, '1'),
                                      GetTensorDim(stride_, data_format_, '2')}};
     std::array<int64, 3> out, padding;
 
-    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, filter_size, strides,
-                                            padding_, &out, &padding));
+    OP_REQUIRES_OK(
+        context, Get3dOutputSizeV2(input_size, filter_size, dilations, strides,
+                                   padding_, &out, &padding));
     TensorShape out_shape = ShapeFromFormat(
         data_format_, in_batch, {{out[0], out[1], out[2]}}, out_depth);
     Tensor* output;
@@ -131,10 +164,12 @@ class Conv3DOp : public BinaryOp<T> {
     if (out_shape.num_elements() == 0) return;
 
     LaunchConvOp<Device, T>::launch(context, cudnn_use_autotune_, input, filter,
-                                    strides, padding_, data_format_, output);
+                                    dilations, strides, padding_, data_format_,
+                                    output);
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -165,6 +200,7 @@ template <typename T>
 struct LaunchConvOp<GPUDevice, T> {
   static void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
                      const Tensor& input_param, const Tensor& filter,
+                     const std::array<int64, 3>& dilations,
                      const std::array<int64, 3>& strides, const Padding padding,
                      TensorFormat data_format, Tensor* output) {
     auto* stream = ctx->op_device_context()->stream();
@@ -199,6 +235,7 @@ struct LaunchConvOp<GPUDevice, T> {
 
     // NOTE: This only works in NHWC.
     if (filter_planes == 1 && filter_rows == 1 && filter_cols == 1 &&
+        dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1 &&
         strides[0] == 1 && strides[1] == 1 && strides[2] == 1 &&
         data_format == FORMAT_NHWC) {
       // 1x1 filter, so call cublas directly.
@@ -330,7 +367,10 @@ struct LaunchConvOp<GPUDevice, T> {
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
     perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
-    conv_desc.set_filter_stride(DimIndex::X, strides[2])
+    conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
+        .set_dilation_rate(DimIndex::Y, dilations[1])
+        .set_dilation_rate(DimIndex::Z, dilations[0])
+        .set_filter_stride(DimIndex::X, strides[2])
         .set_filter_stride(DimIndex::Y, strides[1])
         .set_filter_stride(DimIndex::Z, strides[0])
         .set_zero_padding(DimIndex::X, pad_cols / 2)
@@ -377,9 +417,7 @@ struct LaunchConvOp<GPUDevice, T> {
         {{in_planes, in_rows, in_cols}},
         out_depth,
         {{filter_planes, filter_rows, filter_cols}},
-        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
-        // conv is supported.
-        /*dilation=*/{{1, 1, 1}},
+        {{dilations[0], dilations[1], dilations[2]}},
         {{strides[0], strides[1], strides[2]}},
         {{pad_planes, pad_rows, pad_cols}},
         dtype,
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 12d6dc5eaf..6dc3d9df31 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -524,6 +524,7 @@ REGISTER_OP("Conv3DBackpropInput")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropInputV2")
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     });
@@ -537,6 +538,7 @@ REGISTER_OP("Conv3DBackpropFilter")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropFilterV2")
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 5, &out));
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index f4616fd661..0b531125f3 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -61,18 +62,18 @@ class Conv3DTest(test.TestCase):
 
   def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, stride,
                             padding, data_format, dtype, use_gpu):
-    total_size_1 = 1
-    total_size_2 = 1
+    total_size_tensor = 1
+    total_size_filter = 1
     for s in tensor_in_sizes:
-      total_size_1 *= s
+      total_size_tensor *= s
     for s in filter_in_sizes:
-      total_size_2 *= s
+      total_size_filter *= s
 
     # Initializes the input tensor with array containing numbers from 0 to 1.
     # We keep the input tensor values fairly small to avoid overflowing float16
     # during the conv3d.
-    x1 = [f * 1.0 / total_size_1 for f in range(1, total_size_1 + 1)]
-    x2 = [f * 1.0 / total_size_2 for f in range(1, total_size_2 + 1)]
+    x1 = [f * 1.0 / total_size_tensor for f in range(1, total_size_tensor + 1)]
+    x2 = [f * 1.0 / total_size_filter for f in range(1, total_size_filter + 1)]
     with self.test_session(use_gpu=use_gpu):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
       t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
@@ -118,6 +119,79 @@ class Conv3DTest(test.TestCase):
 
           self.assertAllClose(expected, value.flatten(), atol=tol, rtol=tol)
 
+  def _ComputeReferenceDilatedConv(self, tensor_in_sizes, filter_in_sizes,
+                                   stride, dilation, padding, data_format,
+                                   use_gpu):
+    """Builds a dilated conv3d and its nn_ops.convolution reference.
+
+    Returns the pair (expected, computed); for "NCDHW" both tensors are
+    transposed back with NCHWToNHWC before being returned.
+    """
+    num_input_values = 1
+    for size in tensor_in_sizes:
+      num_input_values *= size
+    num_filter_values = 1
+    for size in filter_in_sizes:
+      num_filter_values *= size
+
+    # Both operands are filled with the sequence 1.0, 2.0, 3.0, ...
+    input_values = [float(v) for v in range(1, num_input_values + 1)]
+    filter_values = [float(v) for v in range(1, num_filter_values + 1)]
+    # A scalar stride applies to all three spatial dimensions.
+    if isinstance(stride, collections.Iterable):
+      strides = list(stride)
+    else:
+      strides = [stride] * 3
+
+    channels_first = data_format == "NCDHW"
+    with self.test_session(use_gpu=use_gpu):
+      t1 = constant_op.constant(input_values, shape=tensor_in_sizes)
+      t2 = constant_op.constant(filter_values, shape=filter_in_sizes)
+      # conv3d is given 5-element strides/dilations padded with 1s.
+      if channels_first:
+        t1 = test_util.NHWCToNCHW(t1)
+        full_strides = [1, 1] + strides
+        full_dilation = [1, 1] + dilation
+      else:
+        full_strides = [1] + strides + [1]
+        full_dilation = [1] + dilation + [1]
+      expected = nn_ops.convolution(
+          t1, t2, padding=padding, strides=strides, dilation_rate=dilation,
+          data_format=data_format)
+      computed = nn_ops.conv3d(
+          t1, t2, strides=full_strides, dilations=full_dilation,
+          padding=padding, data_format=data_format)
+      if channels_first:
+        expected = test_util.NCHWToNHWC(expected)
+        computed = test_util.NCHWToNHWC(computed)
+    return expected, computed
+
+  def _VerifyDilatedConvValues(self, tensor_in_sizes, filter_in_sizes, stride,
+                               padding, dilations):
+    """Asserts dilated conv3d matches nn_ops.convolution per test config."""
+    expected_results = []
+    computed_results = []
+    tolerances = []
+    default_dilations = (
+        dilations[0] == 1 and dilations[1] == 1 and dilations[2] == 1)
+    for data_format, use_gpu in GetTestConfigs():
+      # If any dilation rate is larger than 1, only test on the GPU because
+      # there is currently no CPU implementation for arbitrary dilation rates.
+      if default_dilations or use_gpu:
+        expected, computed = self._ComputeReferenceDilatedConv(
+            tensor_in_sizes, filter_in_sizes, stride, dilations, padding,
+            data_format, use_gpu)
+        expected_results.append(expected)
+        computed_results.append(computed)
+        tolerances.append(1e-2 if use_gpu else 1e-5)
+    # Evaluate once after all configurations are built; running inside the
+    # loop re-evaluated every previously collected tensor on each pass.
+    with self.test_session() as sess:
+      expected_values = sess.run(expected_results)
+      computed_values = sess.run(computed_results)
+      for e, c, tol in zip(expected_values, computed_values, tolerances):
+        self.assertAllClose(e.flatten(), c.flatten(), atol=tol, rtol=1e-6)
+
   def testConv3D1x1x1Filter(self):
     expected_output = [
         0.18518519, 0.22222222, 0.25925926, 0.40740741, 0.5, 0.59259259,
@@ -145,6 +219,15 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  def testConv3D1x1x1Filter2x1x1Dilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 3, 6, 1, 1],
+          filter_in_sizes=[1, 1, 1, 1, 1],
+          stride=1,
+          padding="VALID",
+          dilations=[2, 1, 1])
+
   # Expected values computed using scipy's correlate function.
   def testConv3D2x2x2Filter(self):
     expected_output = [
@@ -161,6 +244,15 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  def testConv3D2x2x2Filter1x2x1Dilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 4, 6, 3, 1],
+          filter_in_sizes=[2, 2, 2, 1, 1],
+          stride=1,
+          padding="VALID",
+          dilations=[1, 2, 1])
+
   def testConv3DStrides(self):
     expected_output = [
         0.06071429, 0.08988095, 0.10238095, 0.11488095, 0.12738095, 0.13988095,
@@ -546,6 +638,98 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
+  # Testing for backprops
+  def _RunAndVerifyBackprop(self, input_sizes, filter_sizes, output_sizes,
+                            strides, dilations, padding, data_format, use_gpu,
+                            err, mode):  # output_sizes is unused
+    total_input_size = 1
+    total_filter_size = 1
+    for s in input_sizes:
+      total_input_size *= s
+    for s in filter_sizes:
+      total_filter_size *= s
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
+    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    default_dilations = (
+        dilations[0] == 1 and dilations[1] == 1 and dilations[2] == 1)
+
+    # If any dilation rate is larger than 1, only do test on the GPU
+    # because we currently do not have a CPU implementation for arbitrary
+    # dilation rates.
+    if default_dilations or use_gpu:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        if data_format == "NCDHW":
+          input_sizes = test_util.NHWCToNCHW(input_sizes)
+        t1 = constant_op.constant(x1, shape=input_sizes)
+        t2 = constant_op.constant(x2, shape=filter_sizes)
+        full_strides = [1] + strides + [1]
+        full_dilations = [1] + dilations + [1]
+        if data_format == "NCDHW":
+          full_strides = test_util.NHWCToNCHW(full_strides)
+          full_dilations = test_util.NHWCToNCHW(full_dilations)
+        actual = nn_ops.conv3d(
+            t1,
+            t2,
+            strides=full_strides,
+            dilations=full_dilations,
+            padding=padding,
+            data_format=data_format)
+        expected = nn_ops.convolution(
+            t1,
+            t2,
+            padding=padding,
+            strides=strides,
+            dilation_rate=dilations,
+            data_format=data_format)
+        if data_format == "NCDHW":
+          actual = test_util.NCHWToNHWC(actual)
+          expected = test_util.NCHWToNHWC(expected)
+        actual_grad = gradients_impl.gradients(actual, t1
+                                               if mode == "input" else t2)[0]
+        expected_grad = gradients_impl.gradients(expected, t1
+                                                 if mode == "input" else t2)[0]
+        # One gradient per graph: w.r.t. t1 when mode == "input", else t2.
+        actual_value = sess.run(actual_grad)
+        expected_value = sess.run(expected_grad)
+        self.assertShapeEqual(actual_value, actual_grad)
+        self.assertShapeEqual(expected_value, expected_grad)
+      print("expected = ", expected_value)
+      print("actual = ", actual_value)
+      self.assertArrayNear(expected_value.flatten(), actual_value.flatten(),
+                           err)
+
+  def testConv3D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackprop(
+            input_sizes=[1, 3, 6, 1, 1],
+            filter_sizes=[2, 2, 1, 1, 1],
+            output_sizes=[1, 1, 5, 1, 1],
+            strides=[1, 1, 1],
+            dilations=[2, 1, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5,
+            mode="filter")
+
+  def testConv3D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackprop(
+            input_sizes=[1, 3, 6, 1, 1],
+            filter_sizes=[2, 2, 1, 1, 1],
+            output_sizes=[1, 1, 5, 1, 1],
+            strides=[1, 1, 1],
+            dilations=[2, 1, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5,
+            mode="input")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 4af5bd26dd..3a41391340 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -94,6 +94,7 @@ def _Conv3DGrad(op, grad):
           array_ops.shape(op.inputs[0]),
           op.inputs[1],
           grad,
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format),
@@ -101,6 +102,7 @@ def _Conv3DGrad(op, grad):
           op.inputs[0],
           array_ops.shape(op.inputs[1]),
           grad,
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format)
@@ -116,12 +118,14 @@ def _Conv3DBackpropInputGrad(op, grad):
           grad,
           array_ops.shape(op.inputs[1]),
           op.inputs[2],
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format),
       nn_ops.conv3d(
           grad,
           op.inputs[1],
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format)
@@ -136,12 +140,14 @@ def _Conv3DBackpropFilterGrad(op, grad):
           array_ops.shape(op.inputs[0]),
           grad,
           op.inputs[2],
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format), None,
       nn_ops.conv3d(
           op.inputs[0],
           grad,
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format)
-- 
GitLab


From a175841eb549f069ac205fb32bf55314a387fe6d Mon Sep 17 00:00:00 2001
From: jinghuangintel <jing1.huang@intel.com>
Date: Fri, 20 Apr 2018 12:20:00 -0700
Subject: [PATCH 240/450] [INTEL MKLDNN]: Upgrade mkldnn version to v13
 (#18508)

* upgrade mkldnn version to v13

* upgrade mkldnn version to v13 for all platforms
---
 tensorflow/workspace.bzl | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index c58ef87338..f0a81f7754 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -50,31 +50,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl_linux",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz",
       ],
-      sha256 = "feacc3d82565c1231470359b42c696236fae873704e0b013436afba5fd4fd30f",
-      strip_prefix = "mklml_lnx_2018.0.1.20171227",
+      sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146",
+      strip_prefix = "mklml_lnx_2018.0.2.20180127",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
   mkl_repository(
       name = "mkl_windows",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip"
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip"
       ],
-      sha256 = "24bae8d7b22b431a654acadea43f2243c46ae6b1e5a73a4a936825f31d284ee4",
-      strip_prefix = "mklml_win_2018.0.1.20171227",
+      sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded",
+      strip_prefix = "mklml_win_2018.0.2.20180127",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
   mkl_repository(
       name = "mkl_darwin",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz"
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz"
       ],
-      sha256 = "0e954ec6fd3dc5e37f64c4043f6b5613dd687558da3df1028b3b7c29ff5cf77f",
-      strip_prefix = "mklml_mac_2018.0.1.20171227",
+      sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943",
+      strip_prefix = "mklml_mac_2018.0.2.20180127",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
 
@@ -85,11 +85,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "mkl_dnn",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.12.tar.gz",
-          "https://github.com/intel/mkl-dnn/archive/v0.12.tar.gz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz",
+          "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz",
       ],
-      sha256 = "86fa2a8c12a56e3b725945acedeaa82492746be02545aba6d710f097e013e19e",
-      strip_prefix = "mkl-dnn-0.12",
+      sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f",
+      strip_prefix = "mkl-dnn-0.13",
       build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
   )
 
-- 
GitLab


From b23e91d247368f2046dae035b5c7bdda56512077 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 12:37:39 -0700
Subject: [PATCH 241/450] Changed tf_to_tflite build rule.

PiperOrigin-RevId: 193707628
---
 tensorflow/contrib/lite/build_def.bzl | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index b8f6b7fd59..8521677682 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -124,19 +124,19 @@ def tf_to_tflite(name, src, options, out):
     out: name of the output flatbuffer file.
   """
 
-  toco = "//tensorflow/contrib/lite/toco:toco"
+  toco_cmdline = " ".join([  # shell command executed by the genrule below
+      "$(location //tensorflow/contrib/lite/toco:toco)",  # bare labels are not expanded in cmd
+      "--input_format=TENSORFLOW_GRAPHDEF",
+      "--output_format=TFLITE",
+      ("--input_file=$(location %s)" % src),
+      ("--output_file=$(location %s)" % out),
+  ] + options )
   native.genrule(
       name = name,
-      srcs=[src, options],
+      srcs=[src],
       outs=[out],
-      cmd = ("$(location %s) " +
-             "   --input_file=$(location %s) " +
-             "   --output_file=$(location %s) " +
-             "   --input_format=TENSORFLOW_GRAPHDEF" +
-             "   --output_format=TFLITE" +
-             "   `cat $(location %s)`")
-            % (toco, src, out, options),
-      tools= [toco],
+      cmd = toco_cmdline,
+      tools= ["//tensorflow/contrib/lite/toco:toco"],
   )
 
 def tflite_to_json(name, src, out):
-- 
GitLab


From 517d1912f4ec71180944320350a3694332a1dedc Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Fri, 20 Apr 2018 12:40:57 -0700
Subject: [PATCH 242/450] Add a utility to visualize object-based checkpoints

Useful for generating a warm fuzzy feeling that everything you think should be saved was saved, and for explaining what object-based checkpointing is. (Also useful on the former front will be a planned "assert that all of this Graph's trainable variables are accessible from object X" function.)

Somewhat hacky since it generates strings rather than using the pydot bindings (and so works without a pydot dependency).

PiperOrigin-RevId: 193708003
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/checkpoint/__init__.py     |   3 +
 tensorflow/contrib/checkpoint/python/BUILD    |  32 +++++
 .../contrib/checkpoint/python/visualize.py    | 111 ++++++++++++++++++
 .../checkpoint/python/visualize_test.py       |  97 +++++++++++++++
 5 files changed, 244 insertions(+)
 create mode 100644 tensorflow/contrib/checkpoint/python/visualize.py
 create mode 100644 tensorflow/contrib/checkpoint/python/visualize_test.py

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 7e47516550..d28392a62c 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -25,6 +25,7 @@ py_library(
         "//tensorflow/contrib/batching:batch_py",
         "//tensorflow/contrib/bayesflow:bayesflow_py",
         "//tensorflow/contrib/boosted_trees:init_py",
+        "//tensorflow/contrib/checkpoint/python:checkpoint",
         "//tensorflow/contrib/cloud:cloud_py",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py
index 70d7d2d8d7..1192cc44a1 100644
--- a/tensorflow/contrib/checkpoint/__init__.py
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -16,6 +16,7 @@
 
 
 For creating and managing dependencies:
+@@dot_graph_from_checkpoint
 @@split_dependency
 """
 
@@ -24,6 +25,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency
+from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint
+
 from tensorflow.python.util.all_util import remove_undocumented
 
 remove_undocumented(module_name=__name__)
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
index d57b01aab2..a5681ffa61 100644
--- a/tensorflow/contrib/checkpoint/python/BUILD
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -4,6 +4,15 @@ package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
+py_library(
+    name = "checkpoint",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":split_dependency",
+        ":visualize",
+    ],
+)
+
 py_library(
     name = "split_dependency",
     srcs = ["split_dependency.py"],
@@ -27,3 +36,26 @@ py_test(
         "//tensorflow/python/eager:test",
     ],
 )
+
+py_library(
+    name = "visualize",
+    srcs = ["visualize.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:pywrap_tensorflow",
+    ],
+)
+
+py_test(
+    name = "visualize_test",
+    srcs = ["visualize_test.py"],
+    deps = [
+        ":visualize",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:test",
+    ],
+)
diff --git a/tensorflow/contrib/checkpoint/python/visualize.py b/tensorflow/contrib/checkpoint/python/visualize.py
new file mode 100644
index 0000000000..86fbdb41d2
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/visualize.py
@@ -0,0 +1,111 @@
+"""Utilities for visualizing dependency graphs."""
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.training import checkpointable
+
+
+def dot_graph_from_checkpoint(save_path):
+  r"""Visualizes an object-based checkpoint (from `tf.train.Checkpoint`).
+
+  Useful for inspecting checkpoints and debugging loading issues.
+
+  Example usage from Python (requires pydot):
+  ```python
+  import tensorflow as tf
+  import pydot
+
+  dot_string = tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt')
+  parsed, = pydot.graph_from_dot_data(dot_string)
+  parsed.write_svg('/tmp/tensorflow/visualized_checkpoint.svg')
+  ```
+
+  Example command line usage:
+  ```sh
+  python -c "import tensorflow as tf;\
+    print(tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt'))"\
+    | dot -Tsvg > /tmp/tensorflow/checkpoint_viz.svg
+  ```
+
+  Args:
+    save_path: The checkpoint prefix, as returned by `tf.train.Checkpoint.save`
+      or `tf.train.latest_checkpoint`.
+  Returns:
+    A graph in DOT format as a string.
+  """
+  reader = pywrap_tensorflow.NewCheckpointReader(save_path)
+  try:
+    object_graph_string = reader.get_tensor(
+        checkpointable.OBJECT_GRAPH_PROTO_KEY)
+  except errors_impl.NotFoundError:
+    raise ValueError(
+        ('The specified checkpoint "%s" does not appear to be object-based (it '
+         'is missing the key "%s"). Likely it was created with a name-based '
+         'saver and does not contain an object dependency graph.') % (
+             save_path, checkpointable.OBJECT_GRAPH_PROTO_KEY))
+  shape_map = reader.get_variable_to_shape_map()
+  dtype_map = reader.get_variable_to_dtype_map()
+  object_graph = (
+      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+  object_graph.ParseFromString(object_graph_string)
+  graph = 'digraph {\n'
+  def _escape(name):
+    return name.replace('"', '\\"')
+  slot_ids = set()
+  for node in object_graph.nodes:
+    for slot_reference in node.slot_variables:
+      slot_ids.add(slot_reference.slot_variable_node_id)
+  for node_id, node in enumerate(object_graph.nodes):
+    if (len(node.attributes) == 1
+        and node.attributes[0].name == checkpointable.VARIABLE_VALUE_KEY):
+      if node_id in slot_ids:
+        color = 'orange'
+        tooltip_prefix = 'Slot variable'
+      else:
+        color = 'blue'
+        tooltip_prefix = 'Variable'
+      attribute = node.attributes[0]
+      graph += ('N_%d [shape=point label="" color=%s width=.25'
+                ' tooltip="%s %s shape=%s %s"]\n') % (
+                    node_id,
+                    color,
+                    tooltip_prefix,
+                    _escape(attribute.full_name),
+                    shape_map[attribute.checkpoint_key],
+                    dtype_map[attribute.checkpoint_key].name)
+    elif node.slot_variables:
+      graph += ('N_%d [shape=point label="" width=.25 color=red,'
+                'tooltip="Optimizer"]\n') % node_id
+    else:
+      graph += 'N_%d [shape=point label="" width=.25]\n' % node_id
+    for reference in node.children:
+      graph += 'N_%d -> N_%d [label="%s"]\n' % (
+          node_id, reference.node_id, _escape(reference.local_name))
+    for slot_reference in node.slot_variables:
+      graph += 'N_%d -> N_%d [label="%s" style=dotted]\n' % (
+          node_id,
+          slot_reference.slot_variable_node_id,
+          _escape(slot_reference.slot_name))
+      graph += 'N_%d -> N_%d [style=dotted]\n' % (
+          slot_reference.original_variable_node_id,
+          slot_reference.slot_variable_node_id)
+  graph += '}\n'
+  return graph
diff --git a/tensorflow/contrib/checkpoint/python/visualize_test.py b/tensorflow/contrib/checkpoint/python/visualize_test.py
new file mode 100644
index 0000000000..1d9ab78923
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/visualize_test.py
@@ -0,0 +1,97 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+from tensorflow.contrib.checkpoint.python import visualize
+
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import adam
+from tensorflow.python.training import checkpointable_utils
+
+try:
+  import pydot  # pylint: disable=g-import-not-at-top
+except ImportError:
+  pydot = None
+
+
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class DotGraphTests(test.TestCase):
+
+  def testMakeDotGraph(self):
+    with context.eager_mode():
+      input_value = constant_op.constant([[3.]])
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      optimizer_step = resource_variable_ops.ResourceVariable(12)
+      save_checkpoint = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+      optimizer.minimize(functools.partial(model, input_value))
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+      save_path = save_checkpoint.save(checkpoint_prefix)
+      prefix = save_checkpoint.save(save_path)
+
+    dot_graph_string = visualize.dot_graph_from_checkpoint(prefix)
+
+    # The remainder of this test is more-or-less optional since it's so
+    # dependent on pydot/platform/Python versions.
+    if pydot is None:
+      self.skipTest('pydot is required for the remainder of this test.')
+    try:
+      parsed, = pydot.graph_from_dot_data(dot_graph_string)
+    except NameError as e:
+      if "name 'dot_parser' is not defined" in str(e):
+        self.skipTest("pydot isn't working")
+      else:
+        raise
+    # Check that the graph isn't completely trivial
+    self.assertEqual(
+        '"model"',
+        parsed.obj_dict['edges'][('N_0', 'N_1')][0]['attributes']['label'])
+    image_path = os.path.join(self.get_temp_dir(), 'saved.svg')
+    try:
+      parsed.write_svg(image_path)
+    except Exception as e:  # pylint: disable=broad-except
+      # For some reason PyDot's "dot not available" error is an Exception, not
+      # something more specific.
+      if '"dot" not found in path' in str(e):
+        self.skipTest("pydot won't save SVGs (dot not available)")
+      else:
+        raise
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From 0b6ca72332735fe460da23fbcca5c8c24d838f28 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 13:18:02 -0700
Subject: [PATCH 243/450] Update ops-related pbtxt files.

PiperOrigin-RevId: 193712839
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 124 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |  26 ++++
 2 files changed, 150 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index dbd6f859c4..247f9edf5b 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -13445,6 +13445,68 @@ op {
     version: 10
   }
 }
+op {
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
 op {
   name: "Conv3DBackpropFilterV2"
   input_arg {
@@ -13718,6 +13780,68 @@ op {
     version: 10
   }
 }
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
 op {
   name: "Conv3DBackpropInputV2"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 46afe357f0..d1773daebe 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -5651,6 +5651,19 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
   deprecation {
     version: 10
     explanation: "Use Conv3DBackpropFilterV2"
@@ -5774,6 +5787,19 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
   deprecation {
     version: 10
     explanation: "Use Conv3DBackpropInputV2"
-- 
GitLab


From 02075fa2456d951ff3b7bdb8fee76a1b9c6d8716 Mon Sep 17 00:00:00 2001
From: Guozhong Zhuang <Guozhong.Zhuang@intel.com>
Date: Fri, 20 Apr 2018 13:43:06 -0700
Subject: [PATCH 244/450] MKLDNN: conv2d forward DNN primitive reuse
 enhancement (#17943)

* Enable conv2d fwd primitive reuse

* coding style change based on suggestions from TF team

* minor code style fix

* refactor conv2d primitive reuse class and enhance key creation utility

* refactor by introducing ConvFwdDimensions structure

* change 'Execute' method to be a template one per PR review suggestion

* Per PR review suggestion, update DnnOp class to declare related methods as abstract ones

* refactor AddAsKey method - template for scalar value and remove Execute() which is not used yet

* rename padding_l/_r/pl/pr to padding_left or padding_right as recommended

* parameter and variable renaming - to make them more explicit
---
 tensorflow/core/kernels/mkl_conv_ops.cc | 414 +++++++++++++++++-------
 tensorflow/core/util/mkl_util.h         |  87 ++++-
 2 files changed, 389 insertions(+), 112 deletions(-)

diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index f0818eb96d..f2b14f1278 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <map>
 #include <string>
 #include <vector>
+#include <memory>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -42,14 +43,13 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
-
 #include "mkldnn.hpp"
 
 using mkldnn::prop_kind;
 using mkldnn::stream;
-
-using mkldnn::convolution_direct;
 using mkldnn::convolution_forward;
+using mkldnn::convolution_direct;
+
 #else
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -57,11 +57,232 @@ using mkldnn::convolution_forward;
 
 namespace tensorflow {
 
+#ifndef INTEL_MKL_ML
+
+struct ConvFwdDimensions {
+  memory::dims src_dims;
+  memory::dims filter_dims;
+  memory::dims bias_dims;
+  memory::dims dst_dims;
+  memory::dims strides;
+  memory::dims dilations;
+  memory::dims padding_left;
+  memory::dims padding_right;
+
+  ConvFwdDimensions(memory::dims src_dims,
+    memory::dims filter_dims, memory::dims bias_dims,
+    memory::dims dst_dims, memory::dims strides,
+    memory::dims dilations, memory::dims padding_left,
+    memory::dims padding_right) :
+      src_dims(src_dims), filter_dims(filter_dims),
+      bias_dims(bias_dims), dst_dims(dst_dims),
+      strides(strides), dilations(dilations),
+      padding_left(padding_left), padding_right(padding_right) {
+  }
+};
+
+template <typename T>
+class Conv2DFwd : public DnnOp {
+ public:
+  explicit Conv2DFwd(const ConvFwdDimensions& convFwdDims) {
+    fwd_stream_.reset(new stream(stream::kind::eager));
+    // create conv primitive
+    if (conv_fwd_ == nullptr) {
+      Setup(convFwdDims);
+    }
+  }
+
+  ~Conv2DFwd() {}
+
+  // Convolution forward execute with bias
+  //   src_data:    input data buffer of src
+  //   filter_data: input data buffer of filter (weights)
+  //   bias_data:   input data buffer of bias
+  //   dst_data:    output data buffer of dst
+  void Execute(T* src_data, T* filter_data, T* bias_data, T* dst_data) {
+    src_mem_->set_data_handle(static_cast<void*>(src_data));
+    filter_mem_->set_data_handle(static_cast<void*>(filter_data));
+    bias_mem_->set_data_handle(static_cast<void*>(bias_data));
+    dst_mem_->set_data_handle(static_cast<void*>(dst_data));
+    fwd_stream_->submit(fwd_primitives_);
+
+    // after exec, set data handle back
+    src_mem_->set_data_handle(DummyData);
+    filter_mem_->set_data_handle(DummyData);
+    bias_mem_->set_data_handle(DummyData);
+    dst_mem_->set_data_handle(DummyData);
+
+    return;
+  }
+
+  // Convolution forward execute without bias
+  //   src_data:    input data buffer of src
+  //   filter_data: input data buffer of filter (weights)
+  //   dst_data:    output data buffer of dst
+  void Execute(T* src_data, T* filter_data, T* dst_data) {
+    src_mem_->set_data_handle(static_cast<void*>(src_data));
+    filter_mem_->set_data_handle(static_cast<void*>(filter_data));
+    dst_mem_->set_data_handle(static_cast<void*>(dst_data));
+    fwd_stream_->submit(fwd_primitives_);
+
+    // after exec, set data handle back
+    src_mem_->set_data_handle(DummyData);
+    filter_mem_->set_data_handle(DummyData);
+    dst_mem_->set_data_handle(DummyData);
+
+    return;
+  }
+
+  // expected memory format for this primitive instance
+  memory::format src_fmt_;
+  memory::format filter_fmt_;
+
+  // convolution primitive
+  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd_;
+  std::shared_ptr<mkldnn::primitive> conv_fwd_;
+
+ private:
+  void Setup(const ConvFwdDimensions& convFwdDims) {
+    // create memory descriptors for convolution data w/ no specified format
+    src_md_.reset(new memory::desc({convFwdDims.src_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    filter_md_.reset(new memory::desc({convFwdDims.filter_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    dst_md_.reset(new memory::desc({convFwdDims.dst_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    if (!convFwdDims.bias_dims.empty())
+        bias_md_.reset(new memory::desc({convFwdDims.bias_dims},
+            MklDnnType<T>(), memory::format::any));
+
+    // create a convolution
+    if (!convFwdDims.bias_dims.empty()) {
+      fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
+          convolution_direct, *src_md_, *filter_md_, *bias_md_, *dst_md_,
+          convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
+          convFwdDims.padding_right, padding_kind::zero));
+    } else {
+      fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
+          convolution_direct, *src_md_, *filter_md_, *dst_md_,
+          convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
+          convFwdDims.padding_right, padding_kind::zero));
+    }
+
+    fwd_pd_.reset(new convolution_forward::primitive_desc(
+        *fwd_desc_, cpu_engine_));
+
+    // store the expected memory format
+    src_fmt_ = static_cast<mkldnn::memory::format>(
+        fwd_pd_.get()->src_primitive_desc().desc().data.format);
+
+    filter_fmt_ = static_cast<mkldnn::memory::format>(
+        fwd_pd_.get()->weights_primitive_desc().desc().data.format);
+
+    // create memory primitive based on dummy data
+    src_mem_.reset(new memory(fwd_pd_.get()->src_primitive_desc(), DummyData));
+    filter_mem_.reset(new memory(fwd_pd_.get()->weights_primitive_desc(),
+                      DummyData));
+    dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), DummyData));
+
+    // create convolution primitive and add it to net
+    if (!convFwdDims.bias_dims.empty()) {
+        bias_mem_.reset(new memory({{{convFwdDims.bias_dims}, MklDnnType<T>(),
+                        memory::format::x}, cpu_engine_}, DummyData));
+        conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
+                        *filter_mem_, *bias_mem_, *dst_mem_));
+    } else {
+        conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
+                        *filter_mem_, *dst_mem_));
+    }
+
+    fwd_primitives_.push_back(*conv_fwd_);
+    return;
+  }
+
+  // MKLDNN memory
+  std::shared_ptr<mkldnn::memory> src_mem_;
+  std::shared_ptr<mkldnn::memory> filter_mem_;
+  std::shared_ptr<mkldnn::memory> bias_mem_;
+  std::shared_ptr<mkldnn::memory> dst_mem_;
+
+  std::shared_ptr<mkldnn::stream> fwd_stream_;
+  std::vector<mkldnn::primitive> fwd_primitives_;
+
+  // desc & primitive desc
+  std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc_;
+
+  // memory desc
+  std::shared_ptr<mkldnn::memory::desc> src_md_;
+  std::shared_ptr<mkldnn::memory::desc> filter_md_;
+  std::shared_ptr<mkldnn::memory::desc> bias_md_;
+  std::shared_ptr<mkldnn::memory::desc> dst_md_;
+
+  engine cpu_engine_ = engine(engine::cpu, 0);
+};
+
+template <typename T>
+class Conv2DFwdFactory : public DnnOpFactory<T> {
+ public:
+  static Conv2DFwd<T>* Get(const ConvFwdDimensions& convFwdDims) {
+     Conv2DFwd<T>* conv2d_fwd = nullptr;
+
+     // try to find a suitable one in pool
+     conv2d_fwd = dynamic_cast<Conv2DFwd<T>*> (
+       Conv2DFwdFactory<T>::GetInstance().GetConv2DFwd(convFwdDims));
+
+     if (conv2d_fwd == nullptr) {
+       conv2d_fwd = new Conv2DFwd<T>(convFwdDims);
+       Conv2DFwdFactory<T>::GetInstance().SetConv2DFwd(
+           convFwdDims, conv2d_fwd);
+     }
+     return conv2d_fwd;
+  }
+
+ private:
+  Conv2DFwdFactory() {}
+  ~Conv2DFwdFactory() {}
+
+  static const int kDilationH = 0, kDilationW = 1;
+
+  static Conv2DFwdFactory& GetInstance() {
+    static Conv2DFwdFactory instance_;
+    return instance_;
+  }
+
+  static std::string CreateKey(const ConvFwdDimensions& convFwdDims) {
+    std::string prefix = "conv2d_fwd_";
+    FactoryKeyCreator key_creator;
+    key_creator.AddAsKey(prefix);
+    key_creator.AddAsKey(convFwdDims.src_dims);
+    key_creator.AddAsKey(convFwdDims.filter_dims);
+    key_creator.AddAsKey(convFwdDims.bias_dims);
+    key_creator.AddAsKey(convFwdDims.dst_dims);
+    key_creator.AddAsKey(convFwdDims.strides);
+    key_creator.AddAsKey(convFwdDims.dilations);
+    key_creator.AddAsKey(convFwdDims.padding_left);
+    key_creator.AddAsKey(convFwdDims.padding_right);
+    return key_creator.GetKey();
+  }
+
+  DnnOp* GetConv2DFwd(const ConvFwdDimensions& convFwdDims) {
+    std::string key = CreateKey(convFwdDims);
+    return this->GetOp(key);
+  }
+
+  void SetConv2DFwd(const ConvFwdDimensions& convFwdDims, DnnOp *op) {
+    std::string key = CreateKey(convFwdDims);
+    this->SetOp(key, op);
+  }
+};
+
+#endif
+
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// MKL-DNN is now default. MKL-ML must be specified explicitly.
+// MKL-DNN is the default; MKL-ML must be selected explicitly (INTEL_MKL_ML).
 #ifdef INTEL_MKL_ML
-
 template <typename Device, typename T, bool biasEnabled>
 class MklConv2DOp : public OpKernel {
  public:
@@ -528,8 +749,6 @@ class MklConv2DOp : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     try {
-      auto cpu_engine = engine(engine::cpu, 0);
-
       // Input tensors
       const Tensor& src_tensor = MklGetInput(context, kInputIndex_Src);
       const Tensor& filter_tensor = MklGetInput(context, kInputIndex_Filter);
@@ -538,16 +757,16 @@ class MklConv2DOp : public OpKernel {
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
       OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false,
-                  errors::InvalidArgument("Filter should not be in "
-                                          "Mkl Layout"));
+            errors::InvalidArgument("Filter should not be in "
+            "Mkl Layout"));
 
       MklDnnData<T> src(&cpu_engine);
       MklDnnData<T> filter(&cpu_engine);
-      MklDnnData<T> output(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);  // output
 
-      memory::dims src_dims, filter_dims, padding_l, padding_r,
+      memory::dims src_dims, filter_dims, padding_left, padding_right,
                    dilations, strides;
-      memory::dims output_dims_tf_order, output_dims_mkl_order;
+      memory::dims dst_dims_tf_order, dst_dims_mkl_order;
 
       // Get shapes of input tensors in MKL-DNN order
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_,
@@ -555,31 +774,29 @@ class MklConv2DOp : public OpKernel {
       auto src_tf_shape = GetTfShape(context, kInputIndex_Src);
       auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter);
       conv_utl.GetConvFwdSizesInMklOrder(
-          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
-          &dilations, &output_dims_tf_order, &output_dims_mkl_order,
-          &padding_l, &padding_r);
+          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims,
+          &strides, &dilations, &dst_dims_tf_order, &dst_dims_mkl_order,
+          &padding_left, &padding_right);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
-      TensorShape output_tf_shape = MklDnnDimsToTFShape(output_dims_tf_order);
+      TensorShape dst_tf_shape = MklDnnDimsToTFShape(dst_dims_tf_order);
 
       // Corner cases: output with 0 elements and 0 batch size.
-      Tensor* output_tensor = nullptr;
-      if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) {
-        // TODO(jbobba): Verify correctness here
-        //               Need semantics for Null MKL tensor
-        MklDnnShape output_mkl_shape;
-        output_mkl_shape.SetMklTensor(false);
-
-        AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor,
-                                  src_tf_shape, output_mkl_shape);
+      Tensor* dst_tensor = nullptr;
+      if (dst_tf_shape.num_elements() == 0 ||
+          dst_dims_tf_order[0] == 0) {
+        MklDnnShape dst_mkl_shape;
+        dst_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, kOutputIndex_Dst,
+                    &dst_tensor, src_tf_shape, dst_mkl_shape);
 
         // MklConv2D also outputs converted filter as 2nd output of Conv2D.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
         AllocateOutputSetMklShape(context, kOutputIndex_Filter,
-                                  &output_filter_tensor, filter_tf_shape,
-                                  filter_mkl_shape);
+                                  &output_filter_tensor,
+                                  filter_tf_shape, filter_mkl_shape);
         return;
       }
 
@@ -587,6 +804,7 @@ class MklConv2DOp : public OpKernel {
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
       auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+
       // If input is in MKL layout, then simply grab input layout; otherwise,
       // construct input Tf layout. For TF layout, although input shape
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
@@ -595,6 +813,7 @@ class MklConv2DOp : public OpKernel {
                         ? src_mkl_shape.GetMklLayout()
                         : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
       src.SetUsrMem(src_md, &src_tensor);
+
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
       auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
@@ -603,98 +822,70 @@ class MklConv2DOp : public OpKernel {
                                           memory::format::hwio);
       filter.SetUsrMem(filter_md, &filter_tensor);
 
-      // Set output shape (output_dims) required in MKL-DNN order.
-      // Currently, we set output layout as Tensorflow's layout (NHWC or NCHW
-      // depending on data format). But later we propagate Mkl layout of the
-      // output to the next op directly.
-      output.SetUsrMem(output_dims_mkl_order, tf_fmt);
-
-      // Create memory descriptors for convolution data w/ no specified format.
-      src.SetOpMemDesc(src_dims, memory::format::any);
-      filter.SetOpMemDesc(filter_dims, memory::format::any);
-      output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
-
       // MKLDNN dilation starts from 0.
       dilations[kDilationH] -= 1;
       dilations[kDilationW] -= 1;
 
+      // get a conv2d fwd from primitive pool
+      Conv2DFwd<T> *conv2d_fwd = nullptr;
+      if (biasEnabled) {
+        memory::dims bias_dims = {};
+        conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
+        ConvFwdDimensions convFwdDims(src_dims, filter_dims, bias_dims,
+          dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
+        conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
+      } else {
+        ConvFwdDimensions convFwdDims(src_dims, filter_dims, NONE_DIMS,
+          dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
+        conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
+      }
+
+      // allocate output tensors dst_tensor and filter_out_tensor
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
+      conv_fwd_pd = conv2d_fwd->fwd_pd_;
+      AllocateOutputTensor(context, *conv_fwd_pd,
+                       dst_dims_mkl_order, tf_fmt, &dst_tensor);
+      Tensor* filter_out_tensor = nullptr;
+      AllocateFilterOutputTensor(context, *conv_fwd_pd,
+                                 TFShapeToMklDnnDims(filter_tf_shape),
+                                 &filter_out_tensor);
+
+      T* dst_data = static_cast<T*>(dst_tensor->flat<T>().data());
+
+      // check whether src/filter need reorder
+      std::vector<primitive> net;
+      if (src_md.data.format != conv2d_fwd->src_fmt_)
+          src.CheckReorderToOpMem(
+              conv_fwd_pd.get()->src_primitive_desc(), &net);
+
+      if (filter_md.data.format != conv2d_fwd->filter_fmt_)
+          filter.CheckReorderToOpMem(
+              conv_fwd_pd.get()->weights_primitive_desc(),
+              filter.GetTensorBuffer(filter_out_tensor), &net);
+      stream(stream::kind::eager).submit(net).wait();
+
+      T* src_data = static_cast<T*>(
+                src.GetOpMem().get_data_handle());
+      T* filter_data = static_cast<T*>(
+                filter.GetOpMem().get_data_handle());
+
+      // execute convolution
       if (biasEnabled) {
-          // Create convolution primitive with Bias.
-          MklDnnData<T> bias(&cpu_engine);
-          memory::dims bias_size;
-          conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_size);
-          const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
-          bias.SetUsrMem(bias_size, memory::format::x, &bias_tensor);
-          bias.SetOpMemDesc(bias_size, memory::format::any);
-
-          // Create convolution primitive with Bias.
-          // Use MKLDNN dilated convolution in case of dilated rate (>0).
-          auto conv_desc = (dilations[kDilationH] > 0 ||
-              dilations[kDilationW] > 0) ?
-              convolution_forward::desc(prop_kind::forward,
-                      convolution_direct, src.GetOpMemDesc(),
-                      filter.GetOpMemDesc(), bias.GetOpMemDesc(),
-                      output.GetOpMemDesc(), strides, dilations,
-                      padding_l, padding_r,
-                      TFPaddingToMklDnnPadding(padding_)):
-              convolution_forward::desc(prop_kind::forward,
-                      convolution_direct, src.GetOpMemDesc(),
-                      filter.GetOpMemDesc(), bias.GetOpMemDesc(),
-                      output.GetOpMemDesc(), strides,
-                      padding_l, padding_r,
-                      TFPaddingToMklDnnPadding(padding_));
-
-          auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                  cpu_engine);
-          AllocateOutputTensor(context, conv_prim_desc,
-                               output_dims_mkl_order, tf_fmt, &output_tensor);
-          // Set data handle for output.
-          output.SetUsrMemDataHandle(output_tensor);
-
-          Tensor* filter_out_tensor = nullptr;
-          AllocateFilterOutputTensor(context, conv_prim_desc,
-                TFShapeToMklDnnDims(filter_tf_shape),
-                &filter_out_tensor);
-
-          PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output,
-                               filter_out_tensor);
+        const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
+        T* bias_data = static_cast<T*>(const_cast<T*>(
+            bias_tensor.flat<T>().data()));
+
+        conv2d_fwd->Execute(src_data, filter_data, bias_data, dst_data);
       } else {
-          // Create convolution primitive without Bias.
-          // Use MKLDNN dilated convolution in case of dilated rate (>0).
-          auto conv_desc = (dilations[kDilationH] > 0 ||
-            dilations[kDilationW] > 0) ?
-            convolution_forward::desc(prop_kind::forward,
-              convolution_direct, src.GetOpMemDesc(),
-              filter.GetOpMemDesc(), output.GetOpMemDesc(),
-              strides, dilations, padding_l, padding_r,
-              TFPaddingToMklDnnPadding(padding_)):
-          convolution_forward::desc(prop_kind::forward,
-              convolution_direct, src.GetOpMemDesc(),
-              filter.GetOpMemDesc(), output.GetOpMemDesc(),
-              strides, padding_l, padding_r,
-              TFPaddingToMklDnnPadding(padding_));
-
-          auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                  cpu_engine);
-          AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
-                               tf_fmt, &output_tensor);
-          // Set data handle for output.
-          output.SetUsrMemDataHandle(output_tensor);
-
-          Tensor* filter_out_tensor = nullptr;
-          AllocateFilterOutputTensor(context, conv_prim_desc,
-                TFShapeToMklDnnDims(filter_tf_shape),
-                &filter_out_tensor);
-          PrepareAndExecuteNet(conv_prim_desc, &src, &filter,
-                              nullptr, &output, filter_out_tensor);
+        conv2d_fwd->Execute(src_data, filter_data, dst_data);
       }
-    } catch (mkldnn::error& e) {
+    } catch (mkldnn::error &e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + std::string(e.message) + ", in file " +
-                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
-      OP_REQUIRES_OK(
-          context,
-          errors::Aborted("Operation received an exception:", error_msg));
+                       ", message: " + std::string(e.message) +
+                       ", in file " + std::string(__FILE__) + ":" +
+                       std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+        errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -706,6 +897,7 @@ class MklConv2DOp : public OpKernel {
   const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
   const int kDilationH = 0, kDilationW = 1;
+  engine cpu_engine = engine(engine::cpu, 0);
 
   // Allocate output tensor.
   void AllocateOutputTensor(
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index bc6d2d77a4..50a8e30574 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -19,6 +19,8 @@ limitations under the License.
 
 #include <string>
 #include <vector>
+#include <unordered_map>
+#include <utility>
 
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -1759,7 +1761,90 @@ class MklDnnData {
   }
 };
 
-#endif  // INTEL_MKL_ML
+/// Base class for operations with reuse of DNN primitives
+///
+class DnnOp {
+ public:
+  virtual ~DnnOp() {}
+
+  // Dummy data. Its size, hard-coded as 256 here, does
+  // not matter since MKL should never operate on this buffer.
+  unsigned char DummyData[256];
+};
+
+const mkldnn::memory::dims NONE_DIMS = {};
+// Pool of reusable DnnOp primitives, looked up by string key (per thread).
+template <typename T>
+class DnnOpFactory {
+ public:
+  DnnOpFactory() {}
+  ~DnnOpFactory() {}
+
+  DnnOp* GetOp(const std::string& key) {
+    auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);
+    if (stream_iter == DnnOpFactory<T>::GetHashMap().end()) {
+      return nullptr;
+    } else {
+      return stream_iter->second;
+    }
+  }
+
+  void SetOp(const std::string& key, DnnOp* op) {
+    auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);
+
+    CHECK(stream_iter == DnnOpFactory<T>::GetHashMap().end());
+
+    DnnOpFactory<T>::GetHashMap()[key] = op;
+  }
+
+ private:
+  static inline std::unordered_map<std::string, DnnOp*> &GetHashMap() {
+    static thread_local std::unordered_map<std::string, DnnOp*> map_;
+    return map_;
+  }
+};
+
+// utility class for creating keys of MKL primitive pool.
+class FactoryKeyCreator {
+ public:
+  FactoryKeyCreator() {
+    key_.reserve(kMaxKeyLength);
+  }
+
+  ~FactoryKeyCreator() {}
+
+  void AddAsKey(const string &str) {
+    auto buffer = reinterpret_cast<const char *>(str.c_str());
+    Append(buffer, str.length());
+  }
+
+  void AddAsKey(const mkldnn::memory::dims &dims) {
+    for (unsigned int i = 0; i < dims.size(); i++) {
+      AddAsKey<int>(dims[i]);
+    }
+  }
+
+  template <typename T>
+  void AddAsKey(const T data) {
+    auto buffer = reinterpret_cast<const char *>(&data);
+    Append(buffer, sizeof(T));
+  }
+
+  std::string GetKey() {
+    return key_;
+  }
+
+ private:
+  string key_;
+  const char delimiter = 'x';
+  const int kMaxKeyLength = 256;
+  void Append(const char* data, int len) {
+    key_.append(data, len);
+    key_.append(1, delimiter);
+  }
+};
+
+#endif  // INTEL_MKL_DNN
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
-- 
GitLab


From 99167d3a6393ac47c2e01b6f620a03adeb9ac3e4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 13:48:37 -0700
Subject: [PATCH 245/450] Merged commit includes the following changes:
 193717076  by yifeif:

    Automated g4 rollback of changelist 193713153.

--
193716750  by fchollet:

    Refactor `tf.keras.layers.Embedding` layer to use `embedding_lookup` instead of `gather`. This makes the layer TPU-compatible.

--
193716664  by A. Unique TensorFlower:

    Go: Update generated wrapper functions for TensorFlow ops.

--
193713153  by power:

    Experimental Keras TPU compatibility layer.

--

PiperOrigin-RevId: 193717076
---
 tensorflow/go/op/wrappers.go                  | 32 +++++++++++++++++--
 tensorflow/python/keras/BUILD                 |  1 +
 .../keras/_impl/keras/layers/embeddings.py    |  4 +--
 .../_impl/keras/layers/embeddings_test.py     | 13 ++++++++
 4 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 3b3dff0573..ec7d9dcc4f 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -5917,6 +5917,17 @@ func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
+type Conv3DBackpropFilterAttr func(optionalAttr)
+
+// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes the gradients of 3-D convolution with respect to the filter.
 //
 // DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
@@ -5930,11 +5941,14 @@ func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 //	strides: 1-D tensor of length 5. The stride of the sliding window for each
 // dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
@@ -12306,6 +12320,17 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }
 
+// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
+type Conv3DBackpropInputAttr func(optionalAttr)
+
+// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes the gradients of 3-D convolution with respect to the input.
 //
 // DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
@@ -12319,11 +12344,14 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 //	strides: 1-D tensor of length 5. The stride of the sliding window for each
 // dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 70040b7e74..1c58553156 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -208,6 +208,7 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 591bab7cd8..07b8726b85 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -24,7 +24,7 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
-from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -155,7 +155,7 @@ class Embedding(Layer):
   def call(self, inputs):
     if K.dtype(inputs) != 'int32':
       inputs = math_ops.cast(inputs, 'int32')
-    out = array_ops.gather(self.embeddings, inputs)
+    out = embedding_ops.embedding_lookup(self.embeddings, inputs)
     return out
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
index 9f6793eac8..6ebf5dc94a 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
@@ -65,6 +67,17 @@ class EmbeddingTest(test.TestCase):
         input_dtype='int32',
         expected_output_dtype='float32')
 
+  def test_embedding_correctness(self):
+    with self.test_session():
+      layer = keras.layers.Embedding(output_dim=2, input_dim=2)
+      layer.build((None, 2))
+      matrix = np.array([[1, 1], [2, 2]])
+      layer.set_weights([matrix])
+
+      inputs = keras.backend.constant([[0, 1, 0]], dtype='int32')
+      outputs = keras.backend.eval(layer(inputs))
+      self.assertAllClose(outputs, [[[1, 1], [2, 2], [1, 1]]])
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 5a4356be6822dfe0b0f973852b9b65d69e4c169c Mon Sep 17 00:00:00 2001
From: Brian Patton <bjp@google.com>
Date: Fri, 20 Apr 2018 13:54:00 -0700
Subject: [PATCH 246/450] Fix for: Suggest braces around initialization of
 subobject.

PiperOrigin-RevId: 193717872
---
 tensorflow/python/lib/core/bfloat16.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/lib/core/bfloat16.cc b/tensorflow/python/lib/core/bfloat16.cc
index 7f07deebef..77fa2c1f66 100644
--- a/tensorflow/python/lib/core/bfloat16.cc
+++ b/tensorflow/python/lib/core/bfloat16.cc
@@ -616,8 +616,8 @@ bool Initialize() {
   };
 
   // Comparisons
-  const std::array<int, 3> compare_types = {npy_bfloat16_, npy_bfloat16_,
-                                            NPY_BOOL};
+  const std::array<int, 3> compare_types = {
+      {npy_bfloat16_, npy_bfloat16_, NPY_BOOL}};
 
   if (!register_ufunc("equal", CompareUFunc<Bfloat16EqFunctor>,
                       compare_types)) {
-- 
GitLab


From 1cd64d57143814fc0652c09165735be62d96124f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 13:56:55 -0700
Subject: [PATCH 247/450] Track dependencies between outside_compilation
 clusters so that control edges can be correctly added to sequence compiled
 computations.

PiperOrigin-RevId: 193718295
---
 .../jit/encapsulate_subgraphs_pass.cc         | 378 ++++++++++-
 .../jit/encapsulate_subgraphs_pass_test.cc    | 590 +++++++++++++++++-
 tensorflow/compiler/tf2xla/xla_compiler.cc    |  25 +
 tensorflow/compiler/tf2xla/xla_compiler.h     |  20 +
 4 files changed, 1005 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 9465385b58..7507e193b5 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/jit/graph_to_functiondef.h"
+#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
 #include "tensorflow/compiler/jit/shape_inference_helpers.h"
@@ -160,6 +161,11 @@ class Encapsulator {
             std::move(outside_compilation_attribute)),
         graph_in_(graph_in) {}
 
+  // Find dependencies between subgraphs and outside_compilation clusters that
+  // only manifest via edges between outside_compilation clusters in the outer
+  // (non-compiled) graph.
+  Status FindClusterDependencies();
+
   // Find subgraphs marked with 'group_attribute', and build a new
   // subgraph, one for each value of 'group_attribute'.
   Status SplitIntoSubgraphs();
@@ -230,6 +236,19 @@ class Encapsulator {
   // the shapes of any ancestor RAH outputs. If it can be determined that the
   // shape of the SFH inputs will not be inferrable even once the shapes of the
   // RAH outputs are known, an error is returned by the rewriter.
+  //
+  // Once edges between compiled and outside_compilation clusters have been
+  // replaced by send/recv ops, some dependencies may no longer be apparent.
+  // A clustering pass finds all the dependencies between HC nodes that are only
+  // present as a result of edges between nodes in outside_compilation clusters.
+  // Suppose there is a path from outside_compilation cluster C in subgraph S
+  // to outside_compilation cluster D in subgraph T. If S != T then a control
+  // edge is added from the call node for S to the call node for T, which
+  // ensures that C will execute before D because S executes before T. If S==T
+  // then a control dependency is added between the HC nodes for C and D in S,
+  // and the HC node for C is added to an 'ancestors' attr in the HC node for D
+  // so that during compilation of the HC node for D, an XLA control dependency
+  // can be added to ensure C's SendToHost executes before D's RecvFromHost.
   class Subgraph {
    public:
     // Creates a graph to build the subgraph in, if it doesn't already exist,
@@ -324,6 +343,18 @@ class Encapsulator {
     void RecordOutsideCompilationOutputOrControl(
         const string& outside_compilation_id, const Edge* edge);
 
+    // Records the fact that there is a path from a node in outside_compilation
+    // cluster ancestor to a node in cluster successor that does not go through
+    // the subgraph.
+    void RecordOutsideCompilationDependency(const string& successor,
+                                            const string& ancestor);
+
+    // Returns the mapping from outside_compilation cluster C to the set of
+    // outside_compilation clusters that have a path to C entirely outside
+    // compiled subgraphs.
+    const std::unordered_map<string, std::unordered_set<string>>
+    OutsideCompilationAncestorMap() const;
+
     // Adds the HostCompute nodes for each outside_compilation subgraph.
     Status AddHostComputes(
         const string& subgraph_name,
@@ -406,6 +437,13 @@ class Encapsulator {
     Status AddHostComputeKeyPlaceholder(OutsideCompilationSubgraph* oc_subgraph,
                                         Graph* graph_out);
 
+    // Get the set of outside_compilation clusters and the dependency edges
+    // between them.
+    void GetActiveClusterDependencyGraph(
+        std::unordered_set<string>* clusters,
+        std::unordered_set<string>* has_successor,
+        std::unordered_map<string, std::unordered_set<string>>* ancestors_map);
+
     // Builds a _RecvAtHost node producing all the inputs of an
     // outside_compilation subgraph and stores it in oc_subgraph.recv_at_host.
     Status AddRecvAtHostNode(const string& group_attribute,
@@ -468,6 +506,14 @@ class Encapsulator {
     // The outside_compilation clusters in this subgraph.
     std::unordered_map<string, OutsideCompilationSubgraph>
         outside_compilation_subgraphs_;
+    // For each outside_compilation cluster C, the outside_compilation clusters
+    // that have a path to C outside the compiled graph.
+    std::unordered_map<string, std::unordered_set<string>>
+        outside_compilation_ancestors_;
+    // For each outside_compilation cluster C, the outside_compilation clusters
+    // that have a path from C outside the compiled graph.
+    std::unordered_map<string, std::unordered_set<string>>
+        outside_compilation_successors_;
 
     // NoOp node in the output graph that is sequenced after the call node and
     // used to prevent host-side outside_compilation sends and recvs from being
@@ -556,6 +602,10 @@ class Encapsulator {
       std::unordered_set<std::pair<NodeSlot, NodeSlot>, NodeSlot::PairHasher>*
           edges_added);
 
+  // Adds control dependencies between subgraph call nodes that have
+  // dependencies via outside_compilation edges.
+  Status AddCallNodeDependencies(Graph* graph_out);
+
   // Adds all edges to the output graph.
   Status AddEdgesToOutputGraph(
       const std::unordered_map<const Node*, Node*>& node_images,
@@ -620,10 +670,65 @@ class Encapsulator {
   const Graph* graph_in_;
 
   std::unordered_map<string, Subgraph> subgraphs_;
+  // For each subgraph S the subgraphs S' such that there is a path from some
+  // outside_compilation cluster C' in S' to some node in S that goes only
+  // through the uncompiled graph.
+  std::unordered_map<string, std::unordered_set<string>> subgraph_ancestors_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Encapsulator);
 };
 
+namespace {
+
+// Return in 'sorted' a topological sort of clusters according to the
+// dependencies encoded in ancestors. clusters is the list of all clusters
+// including clusters that are not present in the ancestors map. has_successors
+// is the set of clusters that are ancestors of some other cluster.
+void TopologicalClusterSort(
+    const std::unordered_set<string>& clusters,
+    const std::unordered_set<string>& has_successors,
+    const std::unordered_map<string, std::unordered_set<string>>& ancestors,
+    std::vector<string>* sorted) {
+  // The nodes are placed in 'sorted' in topological order.
+  sorted->clear();
+  // We don't use the standard DFS because we are not operating on Node*
+  // objects.
+  struct Work {
+    string cluster;
+    bool leave;
+  };
+  std::set<string> visited;
+  std::vector<Work> stack;
+  // Seed the processing list with clusters that have no successors.
+  for (const auto& cluster : clusters) {
+    if (has_successors.find(cluster) == has_successors.end()) {
+      stack.push_back({cluster, false});
+    }
+  }
+  while (!stack.empty()) {
+    const Work item = stack.back();
+    stack.pop_back();
+    if (item.leave) {
+      sorted->push_back(item.cluster);
+      continue;
+    }
+
+    if (visited.find(item.cluster) != visited.end()) continue;
+    visited.insert(item.cluster);
+
+    stack.push_back({item.cluster, true});
+    const auto& iter = ancestors.find(item.cluster);
+    if (iter != ancestors.end()) {
+      for (const auto& ancestor : iter->second) {
+        stack.push_back({ancestor, false});
+      }
+    }
+  }
+  CHECK(sorted->size() == clusters.size());
+}
+
+}  // namespace
+
 Node* Encapsulator::Subgraph::GetCallNodeForInputs() const {
   return call_node_inputs_;
 }
@@ -786,12 +891,71 @@ void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl(
   }
 }
 
+void Encapsulator::Subgraph::RecordOutsideCompilationDependency(
+    const string& successor, const string& ancestor) {
+  outside_compilation_ancestors_[successor].insert(ancestor);
+  outside_compilation_successors_[ancestor].insert(successor);
+}
+
+const std::unordered_map<string, std::unordered_set<string>>
+Encapsulator::Subgraph::OutsideCompilationAncestorMap() const {
+  return outside_compilation_ancestors_;
+}
+
+void Encapsulator::Subgraph::GetActiveClusterDependencyGraph(
+    std::unordered_set<string>* clusters,
+    std::unordered_set<string>* has_successor,
+    std::unordered_map<string, std::unordered_set<string>>* ancestors_map) {
+  // During initial clustering the ancestor and successor data structures may
+  // have been built including oc_cluster names that never turned into subgraphs
+  // because they had no edges into or out of the compiled cluster. Remove them
+  // before proceeding to simplify the logic. Get the set of clusters that was
+  // actually added, then remove references to the others.
+  for (const auto& oc_subgraph : outside_compilation_subgraphs_) {
+    clusters->insert(oc_subgraph.first);
+  }
+  for (const auto& cluster : outside_compilation_successors_) {
+    if (clusters->find(cluster.first) != clusters->end()) {
+      for (const auto& successor : cluster.second) {
+        if (clusters->find(successor) != clusters->end()) {
+          has_successor->insert(cluster.first);
+          break;
+        }
+      }
+    }
+  }
+  for (const auto& cluster : outside_compilation_ancestors_) {
+    if (clusters->find(cluster.first) != clusters->end()) {
+      std::unordered_set<string>& ancestors = (*ancestors_map)[cluster.first];
+      for (const auto& ancestor : cluster.second) {
+        if (clusters->find(ancestor) != clusters->end()) {
+          ancestors.insert(ancestor);
+        }
+      }
+    }
+  }
+}
+
 Status Encapsulator::Subgraph::AddHostComputes(
     const string& subgraph_name,
     const std::unordered_map<const Node*, Node*>& node_images) {
-  for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) {
-    const string& oc_subgraph_name = oc_subgraph_iter.first;
-    OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second;
+  // Get the set of outside_compilation clusters and the dependency edges
+  // between them.
+  std::unordered_set<string> clusters;
+  std::unordered_set<string> has_successor;
+  std::unordered_map<string, std::unordered_set<string>> ancestors_map;
+  GetActiveClusterDependencyGraph(&clusters, &has_successor, &ancestors_map);
+  // Topologically sort the outside_compilation clusters according to their
+  // dependency relation.
+  std::vector<string> sorted_clusters;
+  TopologicalClusterSort(clusters, has_successor, ancestors_map,
+                         &sorted_clusters);
+
+  // The host compute nodes added so far, keyed by host compute node name.
+  std::unordered_map<string, Node*> host_compute_node;
+  for (const string& oc_subgraph_name : sorted_clusters) {
+    OutsideCompilationSubgraph& oc_subgraph =
+        outside_compilation_subgraphs_[oc_subgraph_name];
     if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() ||
         !oc_subgraph.outputs_by_src.empty() ||
         !oc_subgraph.control_outputs.empty()) {
@@ -811,13 +975,22 @@ Status Encapsulator::Subgraph::AddHostComputes(
         inputs[input_index].Reset(src_image->name(), src_slot, dtype);
         input_dtypes[input_index] = dtype;
       }
-
       for (const auto& output : oc_subgraph.outputs_by_src) {
         DataType dtype = output.first.dtype;
         int output_index = output.second;
         output_dtypes[output_index] = dtype;
       }
 
+      std::vector<string> host_compute_ancestors;
+      const auto iter = ancestors_map.find(oc_subgraph_name);
+      if (iter != ancestors_map.end()) {
+        for (const string& ancestor_cluster : iter->second) {
+          host_compute_ancestors.push_back(
+              outside_compilation_subgraphs_[ancestor_cluster]
+                  .host_compute_name);
+        }
+      }
+
       NodeDef host_compute_def;
       NodeDefBuilder builder(strings::StrCat("outside_compilation_",
                                              oc_subgraph_name, "_host_compute"),
@@ -825,6 +998,7 @@ Status Encapsulator::Subgraph::AddHostComputes(
       builder.Input(inputs);
       builder.Attr("Tinputs", input_dtypes);
       builder.Attr("Toutputs", output_dtypes);
+      builder.Attr("ancestors", host_compute_ancestors);
       builder.Attr("key",
                    strings::StrCat("host_compute_channel_", subgraph_name, "_",
                                    oc_subgraph_name));
@@ -834,6 +1008,7 @@ Status Encapsulator::Subgraph::AddHostComputes(
 
       Node* host_compute = graph_->AddNode(host_compute_def, &s);
       if (!s.ok()) return s;
+      host_compute_node[host_compute->name()] = host_compute;
       oc_subgraph.host_compute_name = host_compute->name();
 
       // Connect the _HostCompute node to its producers in the subgraph.
@@ -852,6 +1027,12 @@ Status Encapsulator::Subgraph::AddHostComputes(
         graph_->AddControlEdge(src_image, host_compute);
       }
 
+      // Connect the _HostCompute node to its ancestor host compute nodes.
+      for (const auto& ancestor_name : host_compute_ancestors) {
+        Node* ancestor = host_compute_node[ancestor_name];
+        graph_->AddControlEdge(ancestor, host_compute);
+      }
+
       // Connect the consumers in the subgraph to the _HostCompute node.
       for (const auto& output : oc_subgraph.outputs_by_dst) {
         const Node* dst_node = output.first.node;
@@ -1654,6 +1835,17 @@ Status Encapsulator::CopyEdgeToOutputGraph(
   return Status::OK();
 }
 
+Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) {
+  for (const auto& ancestors : subgraph_ancestors_) {
+    const string& subgraph = ancestors.first;
+    for (const string& ancestor : ancestors.second) {
+      graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNodeForOutputs(),
+                                subgraphs_[subgraph].GetCallNodeForInputs());
+    }
+  }
+  return Status::OK();
+}
+
 Status Encapsulator::AddEdgesToOutputGraph(
     const std::unordered_map<const Node*, Node*>& node_images,
     bool parallel_checking, Graph* graph_out) {
@@ -1703,6 +1895,7 @@ Status Encapsulator::AddEdgesToOutputGraph(
     Subgraph& subgraph = subgraph_entry.second;
     subgraph.ConnectSequencerToCallNode(graph_out);
   }
+  TF_RETURN_IF_ERROR(AddCallNodeDependencies(graph_out));
 
   return Status::OK();
 }
@@ -1960,6 +2153,182 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
   return Status::OK();
 }
 
+namespace {
+
+// Helper struct for building cluster dependencies and also debugging cycles in
+// the dependencies. While computing dependencies we construct a mapping from
+// Node* to PathDetails.
+struct PathDetails {
+  struct SubgraphAndCluster {
+    string subgraph;
+    string outside_compilation_cluster;
+    bool operator==(const SubgraphAndCluster& other) const {
+      return subgraph == other.subgraph &&
+             outside_compilation_cluster == other.outside_compilation_cluster;
+    }
+  };
+
+  struct SubgraphAndClusterHash {
+    inline std::size_t operator()(const SubgraphAndCluster& v) const {
+      return hash<string>()(
+          strings::StrCat(v.subgraph, v.outside_compilation_cluster));
+    }
+  };
+
+  typedef std::unordered_set<SubgraphAndCluster, SubgraphAndClusterHash>
+      SubgraphAndClusterSet;
+
+  // Returns the set of (subgraph, oc_cluster) pairs that should be recorded as
+  // ancestors for any successor of this node. If the node is in the outer
+  // graph, it returns the transitive union of the ancestors of the node's
+  // inputs. If the node is in an outside_compilation cluster, it returns just
+  // that cluster. If the node is compiled, it returns the empty set.
+  SubgraphAndClusterSet AncestorsForSuccessor() {
+    if (subgraph.empty()) {
+      return ancestor_clusters;
+    } else if (outside_compilation_cluster.empty()) {
+      return SubgraphAndClusterSet();
+    } else {
+      SubgraphAndCluster entry;
+      entry.subgraph = subgraph;
+      entry.outside_compilation_cluster = outside_compilation_cluster;
+      return SubgraphAndClusterSet({entry});
+    }
+  }
+
+  // The transitive union of the ancestors of this node's inputs. It is used to
+  // compute dependencies and is also logged to help debug a discovered
+  // cycle.
+  SubgraphAndClusterSet ancestor_clusters;
+  // The subgraph attr on this node.
+  string subgraph;
+  // The outside_compilation attr on this node.
+  string outside_compilation_cluster;
+};
+
+// Adds an edge from ancestor to successor to the cycle detector, and returns an
+// error if that edge causes the formation of a cycle. In the error case, logs
+// the contents of the node_ancestors_map to facilitate debugging.
+Status CheckClusterDependencyForCycles(
+    const string& ancestor, const string& successor,
+    const std::unordered_map<string, std::unordered_set<string>>& ancestors,
+    const std::unordered_map<Node*, PathDetails>& node_ancestors_map,
+    GraphCycles* cycle_detector, std::map<string, int>* cycle_detector_map) {
+  if (cycle_detector_map->find(ancestor) == cycle_detector_map->end()) {
+    (*cycle_detector_map)[ancestor] = cycle_detector->NewNode();
+  }
+  if (cycle_detector_map->find(successor) == cycle_detector_map->end()) {
+    (*cycle_detector_map)[successor] = cycle_detector->NewNode();
+  }
+
+  if (!cycle_detector->InsertEdge((*cycle_detector_map)[ancestor],
+                                  (*cycle_detector_map)[successor])) {
+    LOG(ERROR) << "Cycle in outside_compilation clusters";
+    for (const auto& cluster : ancestors) {
+      LOG(ERROR) << "Cluster " << cluster.first << " depends on:";
+      for (const auto& ancestor : cluster.second) {
+        LOG(ERROR) << "  " << ancestor;
+      }
+    }
+    for (const auto& node_ancestors : node_ancestors_map) {
+      LOG(ERROR) << "Node " << node_ancestors.first->name() << " ("
+                 << node_ancestors.second.subgraph << ";"
+                 << node_ancestors.second.outside_compilation_cluster
+                 << ") has ancestor clusters:";
+      for (const auto& ancestor : node_ancestors.second.ancestor_clusters) {
+        LOG(ERROR) << "  " << ancestor.subgraph << ";"
+                   << ancestor.outside_compilation_cluster;
+      }
+    }
+    return errors::InvalidArgument(
+        "Can't compile outside_compilation clusters because there is a "
+        "dependency cycle: see error log for details.");
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status Encapsulator::FindClusterDependencies() {
+  // Map from nodes to ancestor details. A node is entered into the map if it is
+  // in a compilation subgraph, an outside_compilation cluster, or appears on a
+  // path in the outer graph leading from an outside_compilation subgraph.
+  std::unordered_map<Node*, PathDetails> node_ancestors_map;
+  // We check that clusters are acyclic using this cycle detector.
+  GraphCycles cycle_detector;
+  // Map from cluster name to cycle detector node id.
+  std::map<string, int> cycle_detector_map;
+  // Process the nodes in topologically-sorted order.
+  std::vector<Node*> nodes;
+  GetReversePostOrder(*graph_in_, &nodes);
+  for (Node* node : nodes) {
+    string subgraph_name;
+    string oc_cluster;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &subgraph_name, &oc_cluster));
+    // First create an entry in the ancestors map if the node is in a compiled
+    // subgraph or outside_compilation cluster, or if any incoming edge is from
+    // a node with an ancestor map entry; and find the union of all the
+    // ancestors.
+    if (!subgraph_name.empty()) {
+      node_ancestors_map[node].subgraph = subgraph_name;
+      node_ancestors_map[node].outside_compilation_cluster = oc_cluster;
+    }
+    for (Node* src : node->in_nodes()) {
+      const auto iter = node_ancestors_map.find(src);
+      if (iter != node_ancestors_map.end()) {
+        const auto& ancestors_to_follow = iter->second.AncestorsForSuccessor();
+        for (const auto& ancestor : ancestors_to_follow) {
+          if (ancestor.subgraph != subgraph_name ||
+              ancestor.outside_compilation_cluster != oc_cluster) {
+            node_ancestors_map[node].ancestor_clusters.insert(ancestor);
+          }
+        }
+      }
+    }
+    if (!subgraph_name.empty()) {
+      // The node is in a compiled subgraph or an outside_compilation cluster.
+      if (oc_cluster.empty()) {
+        // The node is not in an outside_compilation cluster. Record the
+        // subgraph's ancestor dependencies.
+        for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) {
+          if (cluster.subgraph != subgraph_name) {
+            subgraph_ancestors_[subgraph_name].insert(cluster.subgraph);
+            TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles(
+                cluster.subgraph, subgraph_name, subgraph_ancestors_,
+                node_ancestors_map, &cycle_detector, &cycle_detector_map));
+          }
+        }
+      } else {
+        Subgraph& subgraph = subgraphs_[subgraph_name];
+        // The node is in an outside_compilation cluster. Record the cluster
+        // and/or subgraph ancestor dependencies.
+        for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) {
+          if (cluster.subgraph == subgraph_name) {
+            // The ancestor is in the same subgraph.
+            if (cluster.outside_compilation_cluster != oc_cluster) {
+              // But not in the same oc_cluster, so record the dependency.
+              subgraph.RecordOutsideCompilationDependency(
+                  oc_cluster, cluster.outside_compilation_cluster);
+              TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles(
+                  cluster.outside_compilation_cluster, oc_cluster,
+                  subgraph.OutsideCompilationAncestorMap(), node_ancestors_map,
+                  &cycle_detector, &cycle_detector_map));
+            }
+          } else {
+            // The ancestor is in a different subgraph, so record the
+            // dependency.
+            subgraph_ancestors_[subgraph_name].insert(cluster.subgraph);
+            TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles(
+                cluster.subgraph, subgraph_name, subgraph_ancestors_,
+                node_ancestors_map, &cycle_detector, &cycle_detector_map));
+          }
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
 Status Encapsulator::MakePrunedGraphCopyAndInline(
     const Graph& graph, const std::vector<Node*>& sink_nodes,
     std::unique_ptr<Graph>* pruned_graph,
@@ -2166,6 +2535,7 @@ Status EncapsulateSubgraphsInFunctions(
   Encapsulator encapsulator(std::move(group_attribute),
                             std::move(outside_compilation_attribute),
                             &graph_in);
+  TF_RETURN_IF_ERROR(encapsulator.FindClusterDependencies());
   TF_RETURN_IF_ERROR(encapsulator.SplitIntoSubgraphs());
 
   TF_RETURN_IF_ERROR(encapsulator.BuildFunctionDefs(
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 8599a7038a..3502d1bb45 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -74,7 +74,7 @@ bool EqualProtoMap(const ::tensorflow::protobuf::Map<Tkey, Tvalue>& a,
     if (!compare(elt_a.first, elt_a.second, iter->second)) {
       if (diff) {
         *diff = strings::StrCat(map_name, " expected: element with key '",
-                                key_to_string(elt_a.first), " has value '",
+                                key_to_string(elt_a.first), "' has value '",
                                 value_to_string(elt_a.second), "' got: '",
                                 value_to_string(iter->second), "'");
       }
@@ -121,8 +121,22 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
     }
     return false;
   }
+  std::unordered_set<string> control_input_a;
+  std::unordered_set<string> control_input_b;
   for (int i = 0; i < a.input_size(); ++i) {
-    if (a.input(i) != b.input(i)) {
+    if (str_util::StartsWith(a.input(i), "^")) {
+      if (!str_util::StartsWith(b.input(i), "^")) {
+        if (diff) {
+          *diff = strings::StrCat(
+              diff_preamble, " mismatch for node ", a.name(), " input ", i,
+              ", expected control input ", a.input(i), " got ", b.input(i),
+              " expected:\n", a.DebugString(), "\ngot:\n", b.DebugString());
+        }
+        return false;
+      }
+      control_input_a.insert(a.input(i));
+      control_input_b.insert(b.input(i));
+    } else if (a.input(i) != b.input(i)) {
       if (diff) {
         *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
                                 " input ", i, ", expected ", a.input(i),
@@ -132,11 +146,29 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
       return false;
     }
   }
+  if (control_input_a != control_input_b) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              " control inputs differ expected:\n",
+                              a.DebugString(), "\ngot:\n", b.DebugString());
+    }
+    return false;
+  }
   return EqualProtoMap<string, AttrValue>(
       a.attr(), b.attr(), [](const string& s) { return s; },
       [](const AttrValue& v) { return v.DebugString(); },
       [](const string& key, const AttrValue& av, const AttrValue& bv) {
-        return av.DebugString() == bv.DebugString();
+        if (key == "ancestors") {
+          // The ancestors are added from a set so the order is unpredictable;
+          // just compare set equality not list equality.
+          std::unordered_set<string> a_set(av.list().s().begin(),
+                                           av.list().s().end());
+          std::unordered_set<string> b_set(bv.list().s().begin(),
+                                           bv.list().s().end());
+          return a_set == b_set;
+        } else {
+          return av.DebugString() == bv.DebugString();
+        }
       },
       strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()),
       diff);
@@ -261,6 +293,7 @@ REGISTER_OP("XlaHostCompute")
     .Output("outputs: Toutputs")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("Toutputs: list(type) >= 0")
+    .Attr("ancestors: list(string) >= 0")
     .Attr("key: string")
     .Attr("shape_inference_graph: string = ''")
     .Attr("shapes: list(shape) >= 0")
@@ -899,6 +932,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
            {"C:o:0", "c:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
@@ -1044,17 +1078,20 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
            {"D:o:0", "F:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors",
+             gtl::ArraySlice<string>({"outside_compilation_O1_host_compute"})},
             {"key", "host_compute_channel_F1_O2"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O2"},
             {"shapes", gtl::ArraySlice<DataType>({})},
             {"_outside_compilation_subgraph", "O2"}},
-           {"F"}},
+           {"F", "outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
            {"C:o:0", "D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
@@ -1193,6 +1230,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
            {"C:o:0", "D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
@@ -1215,6 +1253,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
            {"G:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F2_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
@@ -1279,6 +1318,179 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
   TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
 }
 
+// Test with two functions to transform, each with one outside_compilation
+// cluster, with the dependency between them purely from an outside_compilation
+// edge.
+TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* b = InputShaped(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Binary(c, d,
+                     b1.opts()
+                         .WithName("E")
+                         .WithControlInputs({b, d})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* f = Binary(c, e,
+                     b1.opts().WithName("F").WithControlInput(e).WithAttr(
+                         "_encapsulate", "F1"));
+    Node* g =
+        Binary(a, b, b1.opts().WithName("G").WithAttr("_encapsulate", "F2"));
+    Node* h = Unary(g, b1.opts()
+                           .WithName("H")
+                           .WithAttr("_encapsulate", "F2")
+                           .WithAttr("_outside", "O1")
+                           .WithControlInput(e));
+    Node* i = Unary(h, b1.opts().WithName("I").WithAttr("_encapsulate", "F2"));
+    Binary(f, i, b1.opts().WithName("J"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape.opts());
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
+  }
+
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F2", "O1",
+                            {DT_FLOAT}, shape.opts());
+    Node* h = Unary(recv, shape.opts()
+                              .WithName("H")
+                              .WithAttr("_encapsulate", "F2")
+                              .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F2", "O1", {h}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F2_O1", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"},
+           "BinaryTest",
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
+           {},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"C:o:0", "D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}},
+           {"D"}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F2", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {},
+      {
+          {{"G"}, "BinaryTest", {"a_0_arg", "b_0_arg"}},
+          {{"I"},
+           "UnaryTest",
+           {"outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"G:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F2_O1"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F2_O1"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
+      },
+      {{"i_0_retval", "I:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* b = InputShaped(b2.opts().WithName("B"));
+
+    Node* key_constant1 =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant1, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
+    Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv1, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e},
+                               b2.opts().WithControlInput(e));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
+        "F1");
+
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b);
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
+
+    Node* key_constant2 =
+        KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant2, 0), "F2", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* h = Unary(recv2, b2.opts()
+                               .WithName("H")
+                               .WithAttr("_encapsulate", "F2")
+                               .WithAttr("_outside", "O1")
+                               .WithControlInput(e));
+    Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
+                               b2.opts());
+
+    Node* s2 = Sequencer(
+        b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}),
+        "F2");
+    NodeBuilder node_builder2("F2", "F2", lib_def.get());
+    node_builder2.Input(a).Input(b);
+    Node* call2 = b2.opts()
+                      .WithControlInputs({s2, call1})
+                      .FinalizeBuilder(&node_builder2);
+    Binary(call1, call2, b2.opts().WithName("J"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
 // Test with one outside_compilation cluster that has no inputs from the
 // compiled subgraph.
 TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
@@ -1323,6 +1535,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
            {},
            {{"Tinputs", gtl::ArraySlice<DataType>({})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
@@ -1406,6 +1619,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
            {},
            {{"Tinputs", gtl::ArraySlice<DataType>({})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
@@ -1487,6 +1701,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
            {"D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
@@ -1567,6 +1782,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
            {"D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
@@ -1607,6 +1823,371 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
   TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
 }
 
+// Test with two outside_compilation clusters that interact outside the compiled
+// subgraph, where the ancestor has no HostCompute Op.
+TEST(EncapsulateSubgraphsTest,
+     OutsideCompilationClusterDependencyNoSrcCluster) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(a, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(d, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    Node* g = Unary(f, b1.opts()
+                           .WithName("G")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O2")
+                           .WithControlInput(e));
+    Node* h = Unary(g, b1.opts().WithName("H").WithAttr("_encapsulate", "F1"));
+    Binary(e, h, b1.opts().WithName("I"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape2.opts().WithName("KnownShape/_0"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT}, shape2.opts());
+    Node* g = Unary(ops::NodeOut(recv2, 0), shape2.opts()
+                                                .WithName("G")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O2"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, shape2.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"}, "UnaryTest", {"D:o:0"}},
+          {{"H"},
+           "UnaryTest",
+           {"outside_compilation_O2_host_compute:outputs:0"}},
+          {{"outside_compilation_O2_host_compute"},
+           "XlaHostCompute",
+           {"F:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F1_O2"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O2"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O2"}}},
+      },
+      {{"h_0_retval", "H:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                            {DT_FLOAT}, b2.opts());
+    Node* g = Unary(recv, b2.opts()
+                              .WithName("G")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O2")
+                              .WithControlInput(e));
+    Node* send =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, b2.opts());
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b).ControlInput(s1);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+
+    Binary(e, call1, b2.opts().WithName("I"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with two outside_compilation clusters that interact outside the compiled
+// subgraph, where the successor has no HostCompute Op.
+TEST(EncapsulateSubgraphsTest,
+     OutsideCompilationClusterDependencyNoDstCluster) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(d, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    /*Node* g =*/Unary(a, b1.opts()
+                              .WithName("G")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O2")
+                              .WithControlInput(e));
+    Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1"));
+    Binary(e, h, b1.opts().WithName("I"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, shape1.opts());
+    Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"},
+           "UnaryTest",
+           {"outside_compilation_O1_host_compute:outputs:0"}},
+          {{"H"}, "UnaryTest", {"F:o:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
+      },
+      {{"h_0_retval", "H:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv, b2.opts()
+                              .WithName("E")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O1"));
+    Node* send =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
+    /*Node* g =*/Unary(a, b2.opts()
+                              .WithName("G")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O2")
+                              .WithControlInput(e));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b).ControlInput(s1);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+
+    Binary(e, call1, b2.opts().WithName("I"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with two outside_compilation clusters that interact outside the compiled
+// subgraph.
+TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(d, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    Node* g = Unary(d, b1.opts()
+                           .WithName("G")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O2")
+                           .WithControlInput(e));
+    Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1"));
+    /*Node* i =*/Binary(d, e,
+                        b1.opts()
+                            .WithName("I")
+                            .WithAttr("_encapsulate", "F1")
+                            .WithAttr("_outside", "O3")
+                            .WithControlInput(g));
+    Binary(e, h, b1.opts().WithName("J"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, shape1.opts());
+    Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      {{{"C"}, "UnaryTest", {"a_0_arg"}},
+       {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+       {{"F"}, "UnaryTest", {"outside_compilation_O1_host_compute:outputs:0"}},
+       {{"H"}, "UnaryTest", {"F:o:0"}},
+       {{"outside_compilation_O1_host_compute"},
+        "XlaHostCompute",
+        {"D:o:0"},
+        {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"ancestors", gtl::ArraySlice<string>({})},
+         {"key", "host_compute_channel_F1_O1"},
+         {"shape_inference_graph",
+          "_outside_compilation_shape_inference_F1_O1"},
+         {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+         {"_outside_compilation_subgraph", "O1"}}},
+       {{"outside_compilation_O2_host_compute"},
+        "XlaHostCompute",
+        {"D:o:0"},
+        {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"Toutputs", gtl::ArraySlice<DataType>({})},
+         {"ancestors",
+          gtl::ArraySlice<string>({"outside_compilation_O1_host_compute"})},
+         {"key", "host_compute_channel_F1_O2"},
+         {"shape_inference_graph", ""},
+         {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+         {"_outside_compilation_subgraph", "O2"}},
+        {"outside_compilation_O1_host_compute"}},
+       {{"outside_compilation_O3_host_compute"},
+        "XlaHostCompute",
+        {"D:o:0"},
+        {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"Toutputs", gtl::ArraySlice<DataType>({})},
+         {"ancestors",
+          gtl::ArraySlice<string>({"outside_compilation_O1_host_compute",
+                                   "outside_compilation_O2_host_compute"})},
+         {"key", "host_compute_channel_F1_O3"},
+         {"shape_inference_graph", ""},
+         {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+         {"_outside_compilation_subgraph", "O3"}},
+        {"outside_compilation_O1_host_compute",
+         "outside_compilation_O2_host_compute"}}},
+      {{"h_0_retval", "H:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
+    Node* send =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT}, b2.opts());
+    Node* g = Unary(recv2, b2.opts()
+                               .WithName("G")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O2")
+                               .WithControlInput(e));
+    Node* recv3 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O3",
+                             {DT_FLOAT}, b2.opts());
+    /*Node* i =*/Binary(recv3, e,
+                        b2.opts()
+                            .WithName("I")
+                            .WithAttr("_encapsulate", "F1")
+                            .WithAttr("_outside", "O3")
+                            .WithControlInput(g));
+    Node* s1 = Sequencer(b2.opts()
+                             .WithName("F1_sequencer")
+                             .WithControlInputs({recv1, send, recv2, recv3}),
+                         "F1");
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b).ControlInput(s1);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+
+    Binary(e, call1, b2.opts().WithName("J"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
 // Test with one outside_compilation cluster that has no outputs from the
 // compiled subgraph.
 TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
@@ -1731,6 +2312,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
            {"c:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 86263d847a..c0e9967684 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -813,4 +813,29 @@ Status XlaCompiler::SetHostToDeviceMetadata(
   return Status::OK();
 }
 
+// Copies the registered handle into '*handle'; InvalidArgument if none exists.
+Status XlaCompiler::GetHostComputeControlDependency(
+    const string& host_compute_name, xla::ComputationDataHandle* handle) {
+  const auto entry = host_compute_control_output_.find(host_compute_name);
+  if (entry == host_compute_control_output_.end()) {
+    return errors::InvalidArgument(
+        "No registered control handle for host compute Op '", host_compute_name,
+        "'");
+  }
+  *handle = entry->second;
+  return Status::OK();
+}
+
+// Registers 'handle' under 'host_compute_name'; duplicate names are rejected.
+Status XlaCompiler::SetHostComputeControlDependency(
+    const string& host_compute_name, const xla::ComputationDataHandle& handle) {
+  if (host_compute_control_output_.count(host_compute_name) != 0) {
+    return errors::InvalidArgument(
+        "Duplicate control handles registered for host compute Op ",
+        host_compute_name);
+  }
+  host_compute_control_output_[host_compute_name] = handle;
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index a6747bbe72..8f564f35ec 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -325,6 +325,23 @@ class XlaCompiler {
                                  gtl::ArraySlice<DataType> types,
                                  gtl::ArraySlice<TensorShape> shapes);
 
+  // In order to avoid deadlocks from dependencies in host computations, it can
+  // be necessary to enforce a partial order on the execution of HostCompute
+  // Ops. In particular it may be necessary to constrain the SendToHost for one
+  // HostCompute to run before blocking on the RecvAtHost for another
+  // HostCompute. The compiler maintains a mapping from 'host_compute_name' to
+  // handle, where the handle is an 'output' of the HostCompute Op corresponding
+  // to 'host_compute_name'. Another HostCompute Op that needs to be sequenced
+  // later can add the handle as an 'input' to enforce the constraints.
+  // 'host_compute_name' can be any string the client wishes to use to identify
+  // a given HostCompute Op as long as the names are unique within the
+  // compilation.
+  Status GetHostComputeControlDependency(const string& host_compute_name,
+                                         xla::ComputationDataHandle* handle);
+  Status SetHostComputeControlDependency(
+      const string& host_compute_name,
+      const xla::ComputationDataHandle& handle);
+
   const Options& options() const { return options_; }
   xla::Client* client() const { return options_.client; }
   FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_; }
@@ -391,6 +408,9 @@ class XlaCompiler {
   std::unordered_map<string, tf2xla::HostTransferMetadata> host_compute_sends_;
   std::unordered_map<string, tf2xla::HostTransferMetadata> host_compute_recvs_;
 
+  std::unordered_map<string, xla::ComputationDataHandle>
+      host_compute_control_output_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler);
 };
 
-- 
GitLab


From d82d04f15992e224743f29aa75134ed04aa064a7 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 20 Apr 2018 13:58:51 -0700
Subject: [PATCH 248/450] Automated g4 rollback of changelist 193694958

PiperOrigin-RevId: 193718607
---
 .../core/distributed_runtime/master_session.cc     | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 1c67b42e76..ebe350d313 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -89,10 +89,6 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   ~ReffedClientGraph() override {
     if (should_deregister_) {
       DeregisterPartitions();
-    } else {
-      for (Part& part : partitions_) {
-        worker_cache_->ReleaseWorker(part.name, part.worker);
-      }
     }
   }
 
@@ -1178,8 +1174,14 @@ Status MasterSession::Create(GraphDef* graph_def,
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForBaseGraph(
         graph_def, execution_options, &execution_state_));
   }
-  should_delete_worker_sessions_ = true;
-  return CreateWorkerSessions(options);
+  // TODO(b/36574172): Remove these conditions when ClusterSpec
+  // propagation is supported in all servers.
+  if (options.cluster_def != nullptr ||
+      session_opts_.config.isolate_session_state()) {
+    should_delete_worker_sessions_ = true;
+    return CreateWorkerSessions(options);
+  }
+  return Status::OK();
 }
 
 Status MasterSession::CreateWorkerSessions(
-- 
GitLab


From 9fc5bacba49eb31c7d536963879ccc62ecfbaf76 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 14:25:57 -0700
Subject: [PATCH 249/450] Pin rbe-debian8-tf container to a newer base image

- Also improve how numpy is installed (not compiling from source) for containers based on distros other than Ubuntu 14.04

PiperOrigin-RevId: 193722848
---
 tensorflow/tools/ci_build/Dockerfile.rbe.cpu             | 2 +-
 .../tools/ci_build/install/install_pip_packages.sh       | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
index 6f0798b1af..3bc52b9ed6 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
@@ -1,4 +1,4 @@
-FROM launcher.gcr.io/google/rbe-debian8:r322167
+FROM launcher.gcr.io/google/rbe-debian8:r327695
 LABEL maintainer="Yu Yi <yiyu@google.com>"
 
 # Copy install scripts
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 9644277fab..5aaf544afd 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -65,8 +65,13 @@ rm -rf /usr/lib/python3/dist-packages/six*
 # numpy needs to be installed from source to fix segfaults. See:
 # https://github.com/tensorflow/tensorflow/issues/6968
 # This workaround isn't needed for Ubuntu 16.04 or later.
-pip2 install --no-binary=:all: --upgrade numpy==1.12.0
-pip3 install --no-binary=:all: --upgrade numpy==1.12.0
+if $(cat /etc/*-release | grep -q 14.04); then
+  pip2 install --no-binary=:all: --upgrade numpy==1.12.0
+  pip3 install --no-binary=:all: --upgrade numpy==1.12.0
+else
+  pip2 install --upgrade numpy==1.12.0
+  pip3 install --upgrade numpy==1.12.0
+fi
 
 pip2 install scipy==0.18.1
 pip3 install scipy==0.18.1
-- 
GitLab


From 9f312f32091534bfc115212d2ec7c838180df663 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 14:30:48 -0700
Subject: [PATCH 250/450] Updating Generate Random Tensor to generate tensors
 whose values are small and do not cause overflow for arithmetic operations.

PiperOrigin-RevId: 193723661
---
 tensorflow/core/grappler/optimizers/BUILD      | 1 -
 tensorflow/core/grappler/utils/BUILD           | 1 +
 tensorflow/core/grappler/utils/grappler_test.h | 4 +++-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 3ab8d8f584..42c3580d40 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -112,7 +112,6 @@ tf_cc_test(
     name = "constant_folding_test",
     srcs = ["constant_folding_test.cc"],
     shard_count = 5,
-    tags = ["noasan"],
     deps = [
         ":constant_folding",
         "//tensorflow/cc:cc_ops",
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index b473f32c45..44ef4a965b 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -128,6 +128,7 @@ cc_library(
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core/grappler:grappler_item",
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
index e1394b9c35..c2ba5ee7e8 100644
--- a/tensorflow/core/grappler/utils/grappler_test.h
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -62,7 +63,8 @@ class GrapplerTest : public ::testing::Test {
   Tensor GenerateRandomTensor(const TensorShape& shape) const {
     typedef typename EnumToDataType<DTYPE>::Type T;
     Tensor tensor(DTYPE, shape);
-    tensor.flat<T>() = tensor.flat<T>().random();
+    for (auto i = 0; i < tensor.NumElements(); i++)
+      tensor.flat<T>()(i) = i + random::New64() % 10;
     return tensor;
   }
 
-- 
GitLab


From bc78f9b060cece8e29a89f7dbcdedcadbc61891d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 14:32:07 -0700
Subject: [PATCH 251/450] internal END_PUBLIC

BEGIN_PUBLIC
Automated g4 rollback of changelist 193600682

PiperOrigin-RevId: 193723856
---
 .../layers/python/layers/rev_block_lib.py     |  77 ++-----------
 .../python/layers/rev_block_lib_test.py       | 102 ------------------
 2 files changed, 11 insertions(+), 168 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 9f904cc302..02d294c68f 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -45,7 +45,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
-from tensorflow.python.util import tf_inspect
 
 __all__ = ["rev_block", "RevBlock", "recompute_grad"]
 
@@ -430,13 +429,12 @@ def enable_with_args(dec):
 
 
 @enable_with_args
-def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False,
-                   tensor_arg_names=None):
+def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
   """Decorator that recomputes the function on the backwards pass.
 
   Args:
-    fn: the subgraph-producing function to wrap and recompute when computing
-      gradients. Provide `tensor_arg_names` if not all arguments are `Tensor`s.
+    fn: a function that takes Tensors (all as positional arguments) and returns
+      a tuple of Tensors.
     use_data_dep: `bool`, if `True` will use a dummy data dependency to force
       the recompute to happen. If `False` will use a control dependency. By
       default will be `True` if in an XLA context and `False` otherwise. XLA
@@ -445,25 +443,17 @@ def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False,
       that all gradients are produced before any are consumed by downstream ops.
       If `use_data_dep` is also `True`, will use a data dependency instead of
       a control dependency.
-    tensor_arg_names: `list<str>`, names of the `Tensor` arguments to `fn`. If
-      `None`, assumes all arguments are `Tensor`s.
 
   Returns:
     A wrapped fn that is identical to fn when called, but its activations will
     be discarded and recomputed on the backwards pass (i.e. on a call to
     tf.gradients).
   """
-  if tensor_arg_names:
-    if not isinstance(tensor_arg_names, (list, tuple)):
-      raise TypeError("tensor_arg_names must be a list")
 
   @functools.wraps(fn)
-  def wrapped(*args, **kwargs):
-    tensor_only_fn, tensor_args = _make_tensor_only(fn, args, kwargs,
-                                                    tensor_arg_names)
+  def wrapped(*args):
     return _recompute_grad(
-        tensor_only_fn, tensor_args, use_data_dep=use_data_dep,
-        tupleize_grads=tupleize_grads)
+        fn, args, use_data_dep=use_data_dep, tupleize_grads=tupleize_grads)
 
   return wrapped
 
@@ -473,59 +463,11 @@ def _is_on_tpu():
   return control_flow_util.GetContainingXLAContext(ctxt) is not None
 
 
-def _make_tensor_only(fn, args, kwargs, tensor_arg_names):
-  """Return fn such that it only takes Tensor args for tensor_arg_names."""
-  argspec = tf_inspect.getargspec(fn)
-  if argspec.varargs is not None or argspec.keywords is not None:
-    raise ValueError("Function decorated with recompute_grad must not use "
-                     "*args or **kwargs.")
-  fn_arg_names = list(argspec.args)
-
-  # name_to_arg is a dict of argument name to argument value, including both
-  # positional and keyword arguments passed.
-  name_to_arg = {}
-  # Populate positional arguments.
-  for name, arg in zip(fn_arg_names[:len(args)], args):
-    name_to_arg[name] = arg
-  # Populate keyword arguments.
-  name_to_arg.update(kwargs)
-
-  # Separate the Tensor arguments from the non-Tensor arguments.
-  # The default is that all arguments are Tensor arguments.
-  tensor_arg_names = tensor_arg_names or fn_arg_names
-  for name in tensor_arg_names:
-    if name not in name_to_arg:
-      raise ValueError("Must provide Tensor argument %s" % name)
-  tensor_args = [name_to_arg[name] for name in tensor_arg_names]
-  non_tensor_kwargs = dict([(name, arg) for name, arg in name_to_arg.items()
-                            if name not in tensor_arg_names])
-
-  # Check that Tensor arguments are in fact Tensors and that non-Tensor
-  # arguments are not.
-  for name, arg in zip(tensor_arg_names, tensor_args):
-    if not isinstance(arg, framework_ops.Tensor):
-      raise TypeError("Fn argument %s must be a Tensor." % name)
-  for name, arg in non_tensor_kwargs.items():
-    if isinstance(arg, framework_ops.Tensor):
-      raise TypeError("Fn argument %s must not be a Tensor." % name)
-
-  # Construct a Tensor-only wrapper function that will pass the non-Tensor
-  # arguments as well when called.
-  def tensor_only_fn(*tensors):
-    all_kwargs = dict(zip(tensor_arg_names, tensors))
-    all_kwargs.update(non_tensor_kwargs)
-    return fn(**all_kwargs)
-
-  return tensor_only_fn, tensor_args
-
-
-def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT,
-                    tupleize_grads=False):
+def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
   """See recompute_grad."""
   for arg in args:
     if not isinstance(arg, framework_ops.Tensor):
       raise ValueError("All inputs to function must be Tensors")
-
   use_data_dep_ = use_data_dep
   if use_data_dep_ == _USE_DEFAULT:
     use_data_dep_ = _is_on_tpu()
@@ -559,11 +501,14 @@ def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT,
     grad_vars = grads[len(inputs):]
     return grad_inputs, grad_vars
 
-  # TODO(rsepassi): Replace with tf.custom_gradient
   @_fn_with_custom_grad(grad_fn)
   def fn_with_recompute(*args):
     cached_vs.append(variable_scope.get_variable_scope())
-    cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
+    # TODO(rsepassi): Rm conditional in TF 1.4
+    if hasattr(contrib_framework_ops, "current_arg_scope"):
+      cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
+    else:
+      cached_arg_scope.append({})
     return fn(*args)
 
   return fn_with_recompute(*args)
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index 66ccc696f9..392a490be1 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -318,108 +318,6 @@ class RecomputeTest(test.TestCase):
       self.assertEqual(1, len(grads))
       self.assertTrue(grads[0] is not None)
 
-  def testWithNontensorArgs(self):
-    @rev_block_lib.recompute_grad(tupleize_grads=True,
-                                  tensor_arg_names=["inputs"])
-    def layer_with_recompute(inputs, plus=None):
-      var = variable_scope.get_variable("var", ())
-      self.assertFalse(plus)  # called with False below
-      if plus:
-        return var + inputs
-      else:
-        return var * inputs
-
-    inputs = array_ops.ones((), dtypes.float32)
-    outputs = layer_with_recompute(inputs, plus=False)
-    loss = math_ops.square(outputs)
-    grads = gradients_impl.gradients(loss, variables.trainable_variables())
-    self.assertEqual(1, len(grads))
-    self.assertTrue(grads[0] is not None)
-
-
-class MakeTensorOnlyTest(test.TestCase):
-
-  def testMakeTensorOnly(self):
-    def fn(a, b, c, d=1, e=None, f=7):
-      return (a, b, c, d, e, f)
-
-    t1 = array_ops.ones(())
-    t2 = array_ops.ones(())
-    t3 = array_ops.ones(())
-    args = [1, t1, 3, t2]
-    kwargs = {"e": t3}
-    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
-        fn, args, kwargs, ["b", "d", "e"])
-    self.assertAllEqual(tensor_args, [t1, t2, t3])
-    out = tensor_only_fn(*tensor_args)
-    self.assertAllEqual(out, (1, t1, 3, t2, t3, 7))
-
-  def testMakeTensorOnlyPositionalArgsOnly(self):
-    def fn(a, b, c):
-      return (a, b, c)
-
-    t1 = array_ops.ones(())
-    t2 = array_ops.ones(())
-    args = [t1, 3, t2]
-    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
-        fn, args, {}, ["a", "c"])
-    self.assertAllEqual(tensor_args, [t1, t2])
-    out = tensor_only_fn(*tensor_args)
-    self.assertAllEqual(out, (t1, 3, t2))
-
-  def testMakeTensorOnlyKwargsArgsOnly(self):
-    def fn(a=1, b=2, c=3):
-      return (a, b, c)
-
-    t1 = array_ops.ones(())
-    t2 = array_ops.ones(())
-    args = [t1]
-    kwargs = {"c": t2}
-    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
-        fn, args, kwargs, ["a", "c"])
-    self.assertAllEqual(tensor_args, [t1, t2])
-    out = tensor_only_fn(*tensor_args)
-    self.assertAllEqual(out, (t1, 2, t2))
-
-  def testErrorOnMissingTensorArg(self):
-    def fn(a, b):
-      return (a, b)
-
-    with self.assertRaisesWithPredicateMatch(
-        ValueError, "provide Tensor argument"):
-      rev_block_lib._make_tensor_only(fn, [], {"b": 2}, ["a"])
-
-  def testErrorOnSignatureSplats(self):
-    def fn1(a, *args):
-      return (a, args)
-
-    err_msg = r"must not use \*args or \*\*kwargs"
-    with self.assertRaisesWithPredicateMatch(ValueError, err_msg):
-      rev_block_lib._make_tensor_only(fn1, [1, 2], {}, ["a"])
-
-    def fn2(a, **kwargs):
-      return (a, kwargs)
-
-    with self.assertRaisesWithPredicateMatch(ValueError, err_msg):
-      rev_block_lib._make_tensor_only(fn2, [], {"a": 1, "b": 2}, ["a"])
-
-  def testErrorOnNonTensorForTensor(self):
-    def fn(a, b):
-      return (a, b)
-
-    with self.assertRaisesWithPredicateMatch(TypeError, "must be a Tensor"):
-      rev_block_lib._make_tensor_only(fn, [2, 3], {}, ["a"])
-
-  def testErrorOnTensorForNonTensor(self):
-    def fn(a, b):
-      return (a, b)
-
-    with self.assertRaisesWithPredicateMatch(
-        TypeError, "must not be a Tensor"):
-      t1 = array_ops.ones(())
-      t2 = array_ops.ones(())
-      rev_block_lib._make_tensor_only(fn, [t1, t2], {}, ["a"])
-
 
 class FnWithCustomGradTest(test.TestCase):
 
-- 
GitLab


From b133f8c70622e52f19631fd93d4b87ee21c52ac6 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 20 Apr 2018 14:58:56 -0700
Subject: [PATCH 252/450] Move the guts of TFE_Execute into EagerExecute

PiperOrigin-RevId: 193728072
---
 tensorflow/c/eager/BUILD                      |   1 -
 tensorflow/c/eager/c_api.cc                   | 531 +-----------------
 tensorflow/core/common_runtime/eager/BUILD    |  21 +-
 .../core/common_runtime/eager/execute.cc      | 489 ++++++++++++++++
 .../core/common_runtime/eager/execute.h       |   7 +
 5 files changed, 508 insertions(+), 541 deletions(-)

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index d66386acbd..fae922ea3b 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -31,7 +31,6 @@ tf_cuda_library(
             "//tensorflow/core/common_runtime/eager:context",
             "//tensorflow/core/common_runtime/eager:eager_executor",
             "//tensorflow/core/common_runtime/eager:execute",
-            "//tensorflow/core/common_runtime/eager:execute_node",
             "//tensorflow/core/common_runtime/eager:kernel_and_device",
             "//tensorflow/core/common_runtime/eager:tensor_handle",
             "//tensorflow/core/common_runtime/eager:copy_to_device_node",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index b7a3097208..975bde7c7f 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -34,7 +34,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/eager/copy_to_device_node.h"
 #include "tensorflow/core/common_runtime/eager/execute.h"
-#include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -219,9 +218,6 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
   }
   return retval;
 }
-}  // extern "C"
-
-extern "C" {
 
 TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
                   TF_Status* status) {
@@ -423,531 +419,18 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
       attr_name, tensorflow::gtl::ArraySlice<const tensorflow::NameAttrList>(
                      funcs.get(), num_values));
 }
-}  // extern "C"
-
-namespace {
-
-// Initializes the step stats if needed.
-void MaybeInitializeStepStats(tensorflow::StepStats* step_stats,
-                              tensorflow::EagerContext* ctx) {
-  // Lazily initialize the RunMetadata with information about all devices if
-  // this is the first call.
-  while (step_stats->dev_stats_size() < ctx->devices()->size()) {
-    int device_idx = step_stats->dev_stats_size();
-    auto* dev_stats = step_stats->add_dev_stats();
-    dev_stats->set_device(ctx->devices()->at(device_idx)->name());
-  }
-}
-
-int StepStatsDeviceIndex(tensorflow::StepStats* step_stats,
-                         tensorflow::EagerContext* ctx,
-                         tensorflow::Device* device) {
-  // Find the current device's index.
-  if (device == nullptr) {
-    device = ctx->HostCPU();
-  }
-  for (int i = 0; i < ctx->devices()->size(); ++i) {
-    if (ctx->devices()->at(i) == device ||
-        ctx->devices()->at(i)->name() == device->name()) {
-      return i;
-    }
-  }
-  // TODO(apassos) do not fall back to host CPU if device is unknown.
-  return 0;
-}
-
-tensorflow::Status ValidateInputTypeAndPlacement(
-    tensorflow::EagerContext* ctx, tensorflow::Device* op_device,
-    tensorflow::EagerOperation* op, const tensorflow::OpKernel* kernel,
-    tensorflow::RunMetadata* run_metadata) {
-  tensorflow::Device* host_device = ctx->HostCPU();
-  const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types();
-  if (memtypes.size() != op->Inputs().size()) {
-    return tensorflow::errors::InvalidArgument(
-        "expected ", memtypes.size(), " inputs, got ", op->Inputs().size());
-  }
-  for (int i = 0; i < op->Inputs().size(); ++i) {
-    const tensorflow::Device* expected_device =
-        memtypes[i] == tensorflow::HOST_MEMORY ? host_device : op_device;
-    tensorflow::TensorHandle* handle = op->Inputs()[i];
-    tensorflow::Device* handle_device = nullptr;
-    TF_RETURN_IF_ERROR(handle->Device(&handle_device));
-    const tensorflow::Device* actual_device =
-        handle_device == nullptr ? host_device : handle_device;
-    if (expected_device != actual_device) {
-      switch (ctx->GetDevicePlacementPolicy()) {
-        case tensorflow::DEVICE_PLACEMENT_SILENT_FOR_INT32:
-          // TODO(xpan): See if we could bubble python related error up
-          // to python level.
-          if (handle->dtype == tensorflow::DT_INT32) {
-            // Note: enabling silent copies of int32 tensors to match behavior
-            // of graph mode.
-            break;
-          }
-          TF_FALLTHROUGH_INTENDED;
-        case tensorflow::DEVICE_PLACEMENT_EXPLICIT:
-          return tensorflow::errors::InvalidArgument(
-              "Tensors on conflicting devices:"
-              " cannot compute ",
-              op->Name(), " as input #", i, " was expected to be on ",
-              expected_device->name(), " but is actually on ",
-              actual_device->name(), " (operation running on ",
-              op_device->name(), ")",
-              " Tensors can be copied explicitly using .gpu() or .cpu() "
-              "methods,"
-              " or transparently copied by using tf.enable_eager_execution("
-              "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
-              "between devices"
-              " may slow down your model");
-        case tensorflow::DEVICE_PLACEMENT_WARN:
-          LOG(WARNING) << "before computing " << op->Name() << " input #" << i
-                       << " was expected to be on " << expected_device->name()
-                       << " but is actually on " << actual_device->name()
-                       << " (operation running on " << op_device->name()
-                       << "). This triggers a copy which can be a performance "
-                          "bottleneck.";
-          break;
-        case tensorflow::DEVICE_PLACEMENT_SILENT:  // Do nothing.
-          break;
-      }
-      // We are only here if the policy is warn or silent copies, so we should
-      // trigger a copy.
-      auto pre_time = tensorflow::Env::Default()->NowMicros();
-      tensorflow::TensorHandle* copied_tensor = nullptr;
-      tensorflow::Status status = tensorflow::EagerCopyToDevice(
-          handle, ctx, expected_device->name().c_str(), &copied_tensor);
-      if (run_metadata != nullptr) {
-        auto* step_stats = run_metadata->mutable_step_stats();
-        MaybeInitializeStepStats(step_stats, ctx);
-        // Record the sending on the source device for now.
-        int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
-        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
-        auto* node_stats = dev_stats->add_node_stats();
-        node_stats->set_node_name("_Send");
-        node_stats->set_all_start_micros(pre_time);
-        node_stats->set_op_end_rel_micros(
-            tensorflow::Env::Default()->NowMicros() - pre_time);
-      }
-      if (!status.ok()) {
-        if (copied_tensor != nullptr) copied_tensor->Unref();
-        return tensorflow::errors::Internal(
-            "Failed copying input tensor from ", actual_device->name(), " to ",
-            expected_device->name(), " in order to run ", op->Name(), ": ",
-            status.error_message());
-      }
-      handle->Unref();
-      handle = copied_tensor;
-      (*op->MutableInputs())[i] = copied_tensor;
-    }
-    if (handle->dtype != kernel->input_type(i)) {
-      return tensorflow::errors::InvalidArgument(
-          "cannot compute ", op->Name(), " as input #", i,
-          " was expected to be a ",
-          tensorflow::DataTypeString(kernel->input_type(i)),
-          " tensor but is a ", tensorflow::DataTypeString(handle->dtype),
-          " tensor");
-    }
-  }
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef,
-                                 tensorflow::EagerContext* ctx,
-                                 TF_Status* status) {
-  tensorflow::DeviceSet ds;
-  for (tensorflow::Device* d : *ctx->devices()) {
-    ds.AddDevice(d);
-  }
-  tensorflow::DeviceTypeVector final_devices;
-  status->status = tensorflow::SupportedDeviceTypesForNode(
-      ds.PrioritizedDeviceTypeList(), ndef, &final_devices);
-  if (!status->status.ok()) {
-    return nullptr;
-  }
-  if (final_devices.empty()) {
-    status->status = tensorflow::errors::Internal(
-        "Could not find valid device for node ", ndef.DebugString());
-    return nullptr;
-  }
-  for (tensorflow::Device* d : *ctx->devices()) {
-    if (d->device_type() == final_devices[0].type_string()) {
-      return d;
-    }
-  }
-  status->status = tensorflow::errors::Unknown(
-      "Could not find a device for node ", ndef.DebugString());
-  return nullptr;
-}
-
-#ifdef TENSORFLOW_EAGER_USE_XLA
-// Synthesizes and returns a wrapper function over `op`, which must be a
-// primitive op (e.g. matmul).
-//
-// The wrapper function conforms to the function signature expected by
-// _XlaLaunchOp, with input params ordered by <constants, (variable) args and
-// resources>. For example, if the op has input params <Const1, Arg2, Const3,
-// Resource4, Arg5>, they will be reordered to <Const1, Const3, Arg2, Arg5,
-// Resource4> as the input params to the synthesized function.
-//
-// It populates `const_input_types`, `arg_input_types` and
-// `op_input_to_func_input` based on the reordering results, that the caller can
-// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets
-// `status` accordingly.
-const tensorflow::FunctionDef* OpToFunction(
-    TFE_Op* op, std::vector<TF_DataType>* const_input_types,
-    std::vector<TF_DataType>* arg_input_types,
-    tensorflow::gtl::FlatMap<int, int>* op_input_to_func_input,
-    TF_Status* status) {
-  DCHECK(!op->operation.is_function());
-
-  tensorflow::FunctionDef fdef;
-
-  // Get the OpDef of the op we are trying to encapsulate.
-  TFE_Context* ctx = op->operation.ctx;
-  const tensorflow::OpRegistrationData* op_data;
-  {
-    status->status =
-        ctx->context.FindFunctionOpData(op->operation.Name(), &op_data);
-    if (!status->status.ok()) {
-      return nullptr;
-    }
-  }
-  const tensorflow::OpDef& op_def = op_data->op_def;
-
-  tensorflow::OpDef* signature = fdef.mutable_signature();
-
-  // Handle constant inputs.
-  const std::unordered_set<string> const_inputs(
-      *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(
-          op->operation.Name()));
-
-  // First add place holders for the input args, so that we can refer to them by
-  // position in the next loop. Also tally up the resource inputs.
-  int num_resource_inputs = 0;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    if (op_def.input_arg(i).type() == tensorflow::DT_RESOURCE) {
-      ++num_resource_inputs;
-    }
-    signature->add_input_arg();
-  }
-
-  // Now we map the input params from `op_def` to `signature`, where the param
-  // ordering for `signature` is: <constants, args, resources>.
-  int const_index = 0;
-  int arg_index = const_inputs.size();
-  int resource_index = op_def.input_arg_size() - num_resource_inputs;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    const tensorflow::OpDef::ArgDef& op_input_arg = op_def.input_arg(i);
-    tensorflow::OpDef::ArgDef* func_input_arg = nullptr;
-    if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) {
-      VLOG(1) << "For const input, mapping op input " << i << " to func input "
-              << const_index;
-      (*op_input_to_func_input)[i] = const_index;
-      func_input_arg = signature->mutable_input_arg(const_index++);
-      const_input_types->push_back(
-          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
-    } else if (op_input_arg.type() == tensorflow::DT_RESOURCE) {
-      VLOG(1) << "For resource input, mapping op input " << i
-              << " to func input " << resource_index;
-      (*op_input_to_func_input)[i] = resource_index;
-      func_input_arg = signature->mutable_input_arg(resource_index++);
-    } else {
-      VLOG(1) << "For arg input, mapping op input " << i << " to func input "
-              << arg_index;
-      (*op_input_to_func_input)[i] = arg_index;
-      func_input_arg = signature->mutable_input_arg(arg_index++);
-      arg_input_types->push_back(
-          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
-    }
-
-    func_input_arg->set_name(op_input_arg.name());
-    func_input_arg->set_type(op->operation.Inputs()[i]->dtype);
-  }
-  VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
-
-  // Resources args are at the end of the function input params, and we should
-  // have iterated over all of them.
-  DCHECK_EQ(signature->input_arg_size(), resource_index);
-
-  // Make the synthesized function's name unique.
-  signature->set_name(tensorflow::strings::StrCat(
-      op_def.name(), func_id_generator.fetch_add(1)));
-
-  // Add the node def and set its input names to match op_def's names.
-  const tensorflow::NodeDef& ndef =
-      op->operation.MutableAttrs()->BuildNodeDef();
-  DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
-  *fdef.add_node_def() = ndef;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name());
-  }
-  VLOG(1) << "Added NodeDef: " << fdef.DebugString();
-
-  // Fix the output names and set output types.
-  for (int i = 0; i < op_def.output_arg_size(); ++i) {
-    tensorflow::OpDef::ArgDef* arg = signature->add_output_arg();
-    const tensorflow::OpDef::ArgDef& op_def_arg = op_def.output_arg(i);
-    const string& out_tensor_name = tensorflow::strings::StrCat(
-        ndef.name(), ":", op_def_arg.name(), ":", 0);
-    arg->set_name(op_def_arg.name());
-    (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name;
-    const string& type_attr = op_def_arg.type_attr();
-    if (!type_attr.empty()) {
-      auto i = ndef.attr().find(type_attr);
-      if (i == ndef.attr().end()) {
-        status->status = tensorflow::errors::InvalidArgument(
-            tensorflow::strings::StrCat("Could not find attr ", type_attr,
-                                        " in NodeDef ", ndef.DebugString()));
-        return nullptr;
-      }
-      arg->set_type(i->second.type());
-    }
-  }
-  VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString();
-
-  status->status = ctx->context.AddFunctionDef(fdef);
-  if (!status->status.ok()) return nullptr;
-  const auto ret = ctx->context.FindFunctionDef(signature->name());
-  DCHECK(ret != nullptr);
-  return ret;
-}
-
-// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
-// via XLA.
-std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
-  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->operation.Name();
-  auto launch_op = std::unique_ptr<TFE_Op>(
-      TFE_NewOp(op->operation.ctx, "_XlaLaunch", status));
-  if (TF_GetCode(status) != TF_OK) return nullptr;
-  if (op->operation.device) {
-    TFE_OpSetDevice(launch_op.get(), op->operation.device->name().c_str(),
-                    status);
-    if (TF_GetCode(status) != TF_OK) return nullptr;
-  }
-
-  const tensorflow::FunctionDef* fdef;
-  { fdef = op->operation.ctx->FindFunctionDef(op->operation.Name()); }
-  std::vector<TF_DataType> const_input_types;
-  std::vector<TF_DataType> arg_input_types;
-  tensorflow::gtl::FlatMap<int, int> op_input_to_func_input;
-  if (fdef == nullptr) {
-    // See if this is a primitive op, and if so create a function for it, so
-    // that _XlaLaunchOp can access it.
-    fdef = OpToFunction(op, &const_input_types, &arg_input_types,
-                        &op_input_to_func_input, status);
-    if (!status->status.ok()) return nullptr;
-  } else {
-    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
-    // functions, so we need to find another way to handle constant inputs.
-    for (int i = const_input_types.size();
-         i < fdef->signature().input_arg_size(); ++i) {
-      VLOG(1) << "Adding Targs from input arg " << i;
-      const tensorflow::OpDef::ArgDef& arg = fdef->signature().input_arg(i);
-      arg_input_types.push_back(static_cast<TF_DataType>(arg.type()));
-    }
-  }
-  DCHECK(fdef != nullptr);
-
-  // Copy inputs and their devices.
-  // Since input param reordering may have occurred between `op` and `launch_op`
-  // via `op_input_to_func_input`, adjust the actual inputs accordingly.
-  *launch_op->operation.MutableInputs() = op->operation.Inputs();
-  for (tensorflow::TensorHandle* h : launch_op->operation.Inputs()) {
-    h->Ref();
-  }
-  if (!op_input_to_func_input.empty()) {
-    DCHECK_EQ(op->operation.Inputs().size(), op_input_to_func_input.size());
-    for (int i = 0; i < op_input_to_func_input.size(); ++i) {
-      VLOG(1) << "mapping op input " << i << " to func input "
-              << op_input_to_func_input[i];
-
-      (*launch_op->operation.MuableInputs())[op_input_to_func_input[i]] =
-          op->operation.Inputs()[i];
-    }
-  }
-  launch_op->operation.MutableAttrs()->NumInputs(op->operation.Inputs().size());
-
-  TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
-                        const_input_types.size());
-
-  // Set Targs and Nresources attrs.
-  TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(),
-                        arg_input_types.size());
-  const int num_resource_inputs = fdef->signature().input_arg_size() -
-                                  const_input_types.size() -
-                                  arg_input_types.size();
-  TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs);
-
-  // Set Tresults attr.
-  std::vector<TF_DataType> tresults;
-  for (const tensorflow::OpDef::ArgDef& arg : fdef->signature().output_arg()) {
-    tresults.push_back(static_cast<TF_DataType>(arg.type()));
-  }
-  TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(),
-                        tresults.size());
-
-  // Set function attr.
-  tensorflow::AttrValue attr_value;
-  tensorflow::NameAttrList* func = attr_value.mutable_func();
-  func->set_name(fdef->signature().name());
-  launch_op->attrs.Set("function", attr_value);
-
-  return launch_op;
-}
-#endif  // TENSORFLOW_EAGER_USE_XLA
 
-}  // namespace
-
-extern "C" {
-
-void TFE_Execute(TFE_Op* tfe_op, TFE_TensorHandle** retvals, int* num_retvals,
+void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
                  TF_Status* status) {
-  tensorflow::EagerOperation* op = &tfe_op->operation;
-  tensorflow::EagerContext* ctx = op->EagerContext();
-  status->status = ctx->GetStatus();
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
+      *num_retvals);
+  status->status =
+      tensorflow::EagerExecute(&op->operation, &handle_retvals, num_retvals);
   if (!status->status.ok()) {
     return;
   }
-#ifdef TENSORFLOW_EAGER_USE_XLA
-  std::unique_ptr<TFE_Op> xla_launch_op;
-  if (op->UseXla() && op->Name() != "_XlaLaunch") {
-    xla_launch_op = BuildXlaLaunch(op, status);
-    if (!status->status.ok()) {
-      return;
-    }
-    op = xla_launch_op.get();
-  }
-#endif  // TENSORFLOW_EAGER_USE_XLA
-  // Ensure all resource-touching ops run in the device the resource is,
-  // regardless of anything else that has been specified. This is identical to
-  // the graph mode behavior.
-  for (int i = 0; i < op->Inputs().size(); ++i) {
-    tensorflow::Device* input_op_device = nullptr;
-    status->status = op->Inputs()[i]->OpDevice(&input_op_device);
-    if (!status->status.ok()) return;
-    VLOG(2) << "for op " << op->Name() << " input " << i << " "
-            << tensorflow::DataTypeString(op->Inputs()[i]->dtype) << " "
-            << (input_op_device == nullptr ? "cpu" : input_op_device->name())
-            << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name());
-    if (op->Inputs()[i]->dtype == tensorflow::DT_RESOURCE &&
-        (input_op_device != op->Device() || input_op_device == nullptr)) {
-      tensorflow::Device* d =
-          input_op_device == nullptr ? ctx->HostCPU() : input_op_device;
-      VLOG(1) << "Changing device of operation " << op->Name() << " to "
-              << d->name() << " because input #" << i
-              << " is a resource in this device.";
-      op->SetDevice(d);
-    }
-  }
-  tensorflow::Device* device = op->Device();
-
-  tensorflow::Fprint128 cache_key = op->MutableAttrs()->CacheKey(
-      device == nullptr ? "unspecified" : device->name());
-  tensorflow::KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key);
-  if (kernel == nullptr) {
-    const tensorflow::NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
-    if (device == nullptr) {
-      device = SelectDevice(ndef, ctx, status);
-      if (!status->status.ok()) {
-        return;
-      }
-    }
-    CHECK(device != nullptr);
-    if (ctx->LogDevicePlacement()) {
-      LOG(INFO) << "Executing op " << ndef.op() << " in device "
-                << device->name();
-    }
-    kernel = new tensorflow::KernelAndDevice(ctx->GetRendezvous());
-    // Knowledge of the implementation of Init (and in-turn
-    // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def
-    // will be accessed, so grab on to the lock.
-    // See WARNING comment in Execute (before kernel->Run) - would be nice to
-    // rework to avoid this subtlety.
-    tensorflow::tf_shared_lock l(*ctx->FunctionsMu());
-    status->status =
-        tensorflow::KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel);
-    if (!status->status.ok()) {
-      delete kernel;
-      return;
-    }
-    // Update output_dtypes inside `kernel`.
-    const tensorflow::OpDef* op_def = nullptr;
-    const tensorflow::FunctionDef* function_def =
-        ctx->FuncLibDef()->Find(ndef.op());
-    if (function_def != nullptr) {
-      op_def = &(function_def->signature());
-    }
-    if (op_def == nullptr) {
-      status->status = OpDefForOp(ndef.op().c_str(), &op_def);
-      if (!status->status.ok()) {
-        return;
-      }
-    }
-    tensorflow::DataTypeVector input_dtypes;
-    status->status = InOutTypesForNode(ndef, *op_def, &input_dtypes,
-                                       kernel->mutable_output_dtypes());
-    if (!status->status.ok()) {
-      return;
-    }
-    ctx->AddKernelToCache(cache_key, kernel);
-  }
-  const tensorflow::DataTypeVector& output_dtypes = kernel->output_dtypes();
-  const int output_dtypes_size = output_dtypes.size();
-  if (output_dtypes_size > *num_retvals) {
-    TF_SetStatus(status, TF_INVALID_ARGUMENT,
-                 tensorflow::strings::StrCat("Expecting ", output_dtypes.size(),
-                                             " outputs, but *num_retvals is ",
-                                             *num_retvals)
-                     .c_str());
-    return;
-  }
-  *num_retvals = output_dtypes_size;
-  if (device == nullptr) {
-    // TODO(apassos) debug how the assignment below might return a different
-    // device from the one requested above.
-    device = kernel->device();
-  }
-  status->status = ValidateInputTypeAndPlacement(
-      ctx, device, op, kernel->kernel(),
-      ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
-  if (!status->status.ok()) return;
-  std::unique_ptr<tensorflow::NodeExecStats> maybe_stats;
-  if (ctx->ShouldStoreMetadata()) {
-    maybe_stats.reset(new tensorflow::NodeExecStats);
-    maybe_stats->set_node_name(op->Name());
-    maybe_stats->set_all_start_micros(tensorflow::Env::Default()->NowMicros());
-    maybe_stats->set_op_start_rel_micros(0);
-    maybe_stats->set_scheduled_micros(tensorflow::Env::Default()->NowMicros());
-    // TODO(apassos) track referenced tensors
-  }
-  if (ctx->Async()) {
-    // Note that for async mode, execution order will make sure that all
-    // input handles are ready before executing them.
-    // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
-    tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
-        *num_retvals);
-    tensorflow::uint64 id = ctx->NextId();
-    for (int i = 0; i < *num_retvals; ++i) {
-      tensorflow::TensorHandle* h =
-          new tensorflow::TensorHandle(id, output_dtypes[i], ctx);
-      retvals[i] = new TFE_TensorHandle(h);
-      handle_retvals[i] = h;
-    }
-    tensorflow::EagerNode* node = new tensorflow::ExecuteNode(
-        id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(),
-        output_dtypes, handle_retvals);
-    ctx->ExecutorAdd(node);
-  } else {
-    // Execute checks if retvals[i] is nullptr or not to figure if it needs to
-    // allocate it.
-    tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
-        *num_retvals);
-    status->status = tensorflow::EagerExecute(
-        ctx, op->Device(), op->Inputs(), kernel, maybe_stats.get(),
-        handle_retvals.data(), *num_retvals);
-    for (int i = 0; i < *num_retvals; ++i) {
-      retvals[i] = new TFE_TensorHandle(handle_retvals[i]);
-    }
+  for (int i = 0; i < *num_retvals; ++i) {
+    retvals[i] = new TFE_TensorHandle(handle_retvals[i]);
   }
 }
 
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 00ac4a4e47..13d6b021b5 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -154,26 +154,15 @@ tf_cc_test(
 cc_library(
     name = "execute",
     srcs = ["execute.cc"],
-    hdrs = ["execute.h"],
-    deps = [
-        ":context",
-        ":copy_to_device_node",
-        ":kernel_and_device",
-        ":tensor_handle",
-        "//tensorflow/core:core_cpu_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
+    hdrs = [
+        "execute.h",
+        "execute_node.h",
     ],
-)
-
-cc_library(
-    name = "execute_node",
-    hdrs = ["execute_node.h"],
     deps = [
         ":context",
+        ":copy_to_device_node",
         ":eager_executor",
-        ":execute",
+        ":eager_operation",
         ":kernel_and_device",
         ":tensor_handle",
         "//tensorflow/core:core_cpu_lib",
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 98e8471102..a514f81e14 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -18,8 +18,10 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/copy_to_device_node.h"
+#include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
@@ -32,6 +34,493 @@ limitations under the License.
 
 namespace tensorflow {
 
+namespace {
+
+// Initializes the step stats if needed.
+void MaybeInitializeStepStats(StepStats* step_stats, EagerContext* ctx) {
+  // Lazily initialize the RunMetadata with information about all devices if
+  // this is the first call.
+  while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+    int device_idx = step_stats->dev_stats_size();
+    auto* dev_stats = step_stats->add_dev_stats();
+    dev_stats->set_device(ctx->devices()->at(device_idx)->name());
+  }
+}
+
+int StepStatsDeviceIndex(StepStats* step_stats, EagerContext* ctx,
+                         Device* device) {
+  // Find the current device's index.
+  if (device == nullptr) {
+    device = ctx->HostCPU();
+  }
+  for (int i = 0; i < ctx->devices()->size(); ++i) {
+    if (ctx->devices()->at(i) == device ||
+        ctx->devices()->at(i)->name() == device->name()) {
+      return i;
+    }
+  }
+  // TODO(apassos) do not fall back to host CPU if device is unknown.
+  return 0;
+}
+
+Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
+                                     EagerOperation* op, const OpKernel* kernel,
+                                     RunMetadata* run_metadata) {
+  Device* host_device = ctx->HostCPU();
+  const MemoryTypeVector& memtypes = kernel->input_memory_types();
+  if (memtypes.size() != op->Inputs().size()) {
+    return errors::InvalidArgument("expected ", memtypes.size(),
+                                   " inputs, got ", op->Inputs().size());
+  }
+  for (int i = 0; i < op->Inputs().size(); ++i) {
+    const Device* expected_device =
+        memtypes[i] == HOST_MEMORY ? host_device : op_device;
+    TensorHandle* handle = op->Inputs()[i];
+    Device* handle_device = nullptr;
+    TF_RETURN_IF_ERROR(handle->Device(&handle_device));
+    const Device* actual_device =
+        handle_device == nullptr ? host_device : handle_device;
+    if (expected_device != actual_device) {
+      switch (ctx->GetDevicePlacementPolicy()) {
+        case DEVICE_PLACEMENT_SILENT_FOR_INT32:
+          // TODO(xpan): See if we could bubble python related error up
+          // to python level.
+          if (handle->dtype == DT_INT32) {
+            // Note: enabling silent copies of int32 tensors to match behavior
+            // of graph mode.
+            break;
+          }
+          TF_FALLTHROUGH_INTENDED;
+        case DEVICE_PLACEMENT_EXPLICIT:
+          return errors::InvalidArgument(
+              "Tensors on conflicting devices:"
+              " cannot compute ",
+              op->Name(), " as input #", i, " was expected to be on ",
+              expected_device->name(), " but is actually on ",
+              actual_device->name(), " (operation running on ",
+              op_device->name(), ")",
+              " Tensors can be copied explicitly using .gpu() or .cpu() "
+              "methods,"
+              " or transparently copied by using tf.enable_eager_execution("
+              "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
+              "between devices"
+              " may slow down your model");
+        case DEVICE_PLACEMENT_WARN:
+          LOG(WARNING) << "before computing " << op->Name() << " input #" << i
+                       << " was expected to be on " << expected_device->name()
+                       << " but is actually on " << actual_device->name()
+                       << " (operation running on " << op_device->name()
+                       << "). This triggers a copy which can be a performance "
+                          "bottleneck.";
+          break;
+        case DEVICE_PLACEMENT_SILENT:  // Do nothing.
+          break;
+      }
+      // We are only here if the policy is warn or silent copies, so we should
+      // trigger a copy.
+      auto pre_time = Env::Default()->NowMicros();
+      TensorHandle* copied_tensor = nullptr;
+      Status status = EagerCopyToDevice(
+          handle, ctx, expected_device->name().c_str(), &copied_tensor);
+      if (run_metadata != nullptr) {
+        auto* step_stats = run_metadata->mutable_step_stats();
+        MaybeInitializeStepStats(step_stats, ctx);
+        // Record the sending on the source device for now.
+        int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
+        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+        auto* node_stats = dev_stats->add_node_stats();
+        node_stats->set_node_name("_Send");
+        node_stats->set_all_start_micros(pre_time);
+        node_stats->set_op_end_rel_micros(Env::Default()->NowMicros() -
+                                          pre_time);
+      }
+      if (!status.ok()) {
+        if (copied_tensor != nullptr) copied_tensor->Unref();
+        return errors::Internal("Failed copying input tensor from ",
+                                actual_device->name(), " to ",
+                                expected_device->name(), " in order to run ",
+                                op->Name(), ": ", status.error_message());
+      }
+      handle->Unref();
+      handle = copied_tensor;
+      (*op->MutableInputs())[i] = copied_tensor;
+    }
+    if (handle->dtype != kernel->input_type(i)) {
+      return errors::InvalidArgument(
+          "cannot compute ", op->Name(), " as input #", i,
+          " was expected to be a ", DataTypeString(kernel->input_type(i)),
+          " tensor but is a ", DataTypeString(handle->dtype), " tensor");
+    }
+  }
+  return Status::OK();
+}
+
+Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
+  DeviceSet ds;
+  for (Device* d : *ctx->devices()) {
+    ds.AddDevice(d);
+  }
+  DeviceTypeVector final_devices;
+  auto status = SupportedDeviceTypesForNode(ds.PrioritizedDeviceTypeList(),
+                                            ndef, &final_devices);
+  if (!status.ok()) return status;
+  if (final_devices.empty()) {
+    return errors::Internal("Could not find valid device for node ",
+                            ndef.DebugString());
+  }
+  for (Device* d : *ctx->devices()) {
+    if (d->device_type() == final_devices[0].type_string()) {
+      *device = d;
+      return Status::OK();
+    }
+  }
+  return errors::Unknown("Could not find a device for node ",
+                         ndef.DebugString());
+}
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+// Synthesizes and returns a wrapper function over `op`, which must be a
+// primitive op (e.g. matmul).
+//
+// The wrapper function conforms to the function signature expected by
+// _XlaLaunchOp, with input params ordered by <constants, (variable) args and
+// resources>. For example, if the op has input params <Const1, Arg2, Const3,
+// Resource4, Arg5>, they will be reordered to <Const1, Const3, Arg2, Arg5,
+// Resource4> as the input params to the synthesized function.
+//
+// It populates `const_input_types`, `arg_input_types` and
+// `op_input_to_func_input` based on the reordering results, that the caller can
+// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets
+// `status` accordingly.
+const FunctionDef* OpToFunction(TFE_Op* op,
+                                std::vector<TF_DataType>* const_input_types,
+                                std::vector<TF_DataType>* arg_input_types,
+                                gtl::FlatMap<int, int>* op_input_to_func_input,
+                                TF_Status* status) {
+  DCHECK(!op->operation.is_function());
+
+  FunctionDef fdef;
+
+  // Get the OpDef of the op we are trying to encapsulate.
+  TFE_Context* ctx = op->operation.ctx;
+  const OpRegistrationData* op_data;
+  {
+    status = ctx->context.FindFunctionOpData(op->operation.Name(), &op_data);
+    if (!status.ok()) {
+      return nullptr;
+    }
+  }
+  const OpDef& op_def = op_data->op_def;
+
+  OpDef* signature = fdef.mutable_signature();
+
+  // Handle constant inputs.
+  const std::unordered_set<string> const_inputs(
+      *XlaOpRegistry::CompileTimeConstantInputs(op->operation.Name()));
+
+  // First add place holders for the input args, so that we can refer to them by
+  // position in the next loop. Also tally up the resource inputs.
+  int num_resource_inputs = 0;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    if (op_def.input_arg(i).type() == DT_RESOURCE) {
+      ++num_resource_inputs;
+    }
+    signature->add_input_arg();
+  }
+
+  // Now we map the input params from `op_def` to `signature`, where the param
+  // ordering for `signature` is: <constants, args, resources>.
+  int const_index = 0;
+  int arg_index = const_inputs.size();
+  int resource_index = op_def.input_arg_size() - num_resource_inputs;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    const OpDef::ArgDef& op_input_arg = op_def.input_arg(i);
+    OpDef::ArgDef* func_input_arg = nullptr;
+    if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) {
+      VLOG(1) << "For const input, mapping op input " << i << " to func input "
+              << const_index;
+      (*op_input_to_func_input)[i] = const_index;
+      func_input_arg = signature->mutable_input_arg(const_index++);
+      const_input_types->push_back(
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
+    } else if (op_input_arg.type() == DT_RESOURCE) {
+      VLOG(1) << "For resource input, mapping op input " << i
+              << " to func input " << resource_index;
+      (*op_input_to_func_input)[i] = resource_index;
+      func_input_arg = signature->mutable_input_arg(resource_index++);
+    } else {
+      VLOG(1) << "For arg input, mapping op input " << i << " to func input "
+              << arg_index;
+      (*op_input_to_func_input)[i] = arg_index;
+      func_input_arg = signature->mutable_input_arg(arg_index++);
+      arg_input_types->push_back(
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
+    }
+
+    func_input_arg->set_name(op_input_arg.name());
+    func_input_arg->set_type(op->operation.Inputs()[i]->dtype);
+  }
+  VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
+
+  // Resources args are at the end of the function input params, and we should
+  // have iterated over all of them.
+  DCHECK_EQ(signature->input_arg_size(), resource_index);
+
+  // Make the synthesized function's name unique.
+  signature->set_name(
+      strings::StrCat(op_def.name(), func_id_generator.fetch_add(1)));
+
+  // Add the node def and set its input names to match op_def's names.
+  const NodeDef& ndef = op->operation.MutableAttrs()->BuildNodeDef();
+  DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
+  *fdef.add_node_def() = ndef;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name());
+  }
+  VLOG(1) << "Added NodeDef: " << fdef.DebugString();
+
+  // Fix the output names and set output types.
+  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+    OpDef::ArgDef* arg = signature->add_output_arg();
+    const OpDef::ArgDef& op_def_arg = op_def.output_arg(i);
+    const string& out_tensor_name =
+        strings::StrCat(ndef.name(), ":", op_def_arg.name(), ":", 0);
+    arg->set_name(op_def_arg.name());
+    (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name;
+    const string& type_attr = op_def_arg.type_attr();
+    if (!type_attr.empty()) {
+      auto i = ndef.attr().find(type_attr);
+      if (i == ndef.attr().end()) {
+        status = errors::InvalidArgument(
+            strings::StrCat("Could not find attr ", type_attr, " in NodeDef ",
+                            ndef.DebugString()));
+        return nullptr;
+      }
+      arg->set_type(i->second.type());
+    }
+  }
+  VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString();
+
+  status = ctx->context.AddFunctionDef(fdef);
+  if (!status.ok()) return nullptr;
+  const auto ret = ctx->context.FindFunctionDef(signature->name());
+  DCHECK(ret != nullptr);
+  return ret;
+}
+
+// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
+// via XLA.
+std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
+  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->operation.Name();
+  auto launch_op = std::unique_ptr<TFE_Op>(
+      TFE_NewOp(op->operation.ctx, "_XlaLaunch", status));
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  if (op->operation.device) {
+    TFE_OpSetDevice(launch_op.get(), op->operation.device->name().c_str(),
+                    status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+  }
+
+  const FunctionDef* fdef;
+  { fdef = op->operation.ctx->FindFunctionDef(op->operation.Name()); }
+  std::vector<TF_DataType> const_input_types;
+  std::vector<TF_DataType> arg_input_types;
+  gtl::FlatMap<int, int> op_input_to_func_input;
+  if (fdef == nullptr) {
+    // See if this is a primitive op, and if so create a function for it, so
+    // that _XlaLaunchOp can access it.
+    fdef = OpToFunction(op, &const_input_types, &arg_input_types,
+                        &op_input_to_func_input, status);
+    if (!status.ok()) return nullptr;
+  } else {
+    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
+    // functions, so we need to find another way to handle constant inputs.
+    for (int i = const_input_types.size();
+         i < fdef->signature().input_arg_size(); ++i) {
+      VLOG(1) << "Adding Targs from input arg " << i;
+      const OpDef::ArgDef& arg = fdef->signature().input_arg(i);
+      arg_input_types.push_back(static_cast<TF_DataType>(arg.type()));
+    }
+  }
+  DCHECK(fdef != nullptr);
+
+  // Copy inputs and their devices.
+  // Since input param reordering may have occurred between `op` and `launch_op`
+  // via `op_input_to_func_input`, adjust the actual inputs accordingly.
+  *launch_op->operation.MutableInputs() = op->operation.Inputs();
+  for (TensorHandle* h : launch_op->operation.Inputs()) {
+    h->Ref();
+  }
+  if (!op_input_to_func_input.empty()) {
+    DCHECK_EQ(op->operation.Inputs().size(), op_input_to_func_input.size());
+    for (int i = 0; i < op_input_to_func_input.size(); ++i) {
+      VLOG(1) << "mapping op input " << i << " to func input "
+              << op_input_to_func_input[i];
+
+      (*launch_op->operation.MuableInputs())[op_input_to_func_input[i]] =
+          op->operation.Inputs()[i];
+    }
+  }
+  launch_op->operation.MutableAttrs()->NumInputs(op->operation.Inputs().size());
+
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
+                        const_input_types.size());
+
+  // Set Targs and Nresources attrs.
+  TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(),
+                        arg_input_types.size());
+  const int num_resource_inputs = fdef->signature().input_arg_size() -
+                                  const_input_types.size() -
+                                  arg_input_types.size();
+  TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs);
+
+  // Set Tresults attr.
+  std::vector<TF_DataType> tresults;
+  for (const OpDef::ArgDef& arg : fdef->signature().output_arg()) {
+    tresults.push_back(static_cast<TF_DataType>(arg.type()));
+  }
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(),
+                        tresults.size());
+
+  // Set function attr.
+  AttrValue attr_value;
+  NameAttrList* func = attr_value.mutable_func();
+  func->set_name(fdef->signature().name());
+  launch_op->attrs.Set("function", attr_value);
+
+  return launch_op;
+}
+#endif  // TENSORFLOW_EAGER_USE_XLA
+
+}  // namespace
+
+Status EagerExecute(EagerOperation* op,
+                    gtl::InlinedVector<TensorHandle*, 2>* retvals,
+                    int* num_retvals) {
+  EagerContext* ctx = op->EagerContext();
+  auto status = ctx->GetStatus();
+  if (!status.ok()) return status;
+#ifdef TENSORFLOW_EAGER_USE_XLA
+  std::unique_ptr<TFE_Op> xla_launch_op;
+  if (op->UseXla() && op->Name() != "_XlaLaunch") {
+    xla_launch_op = BuildXlaLaunch(op, status);
+    if (!status.ok()) return status;
+    op = xla_launch_op.get();
+  }
+#endif  // TENSORFLOW_EAGER_USE_XLA
+  // Ensure all resource-touching ops run in the device the resource is,
+  // regardless of anything else that has been specified. This is identical to
+  // the graph mode behavior.
+  for (int i = 0; i < op->Inputs().size(); ++i) {
+    Device* input_op_device = nullptr;
+    status = op->Inputs()[i]->OpDevice(&input_op_device);
+    if (!status.ok()) return status;
+    VLOG(2) << "for op " << op->Name() << " input " << i << " "
+            << DataTypeString(op->Inputs()[i]->dtype) << " "
+            << (input_op_device == nullptr ? "cpu" : input_op_device->name())
+            << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name());
+    if (op->Inputs()[i]->dtype == DT_RESOURCE &&
+        (input_op_device != op->Device() || input_op_device == nullptr)) {
+      Device* d = input_op_device == nullptr ? ctx->HostCPU() : input_op_device;
+      VLOG(1) << "Changing device of operation " << op->Name() << " to "
+              << d->name() << " because input #" << i
+              << " is a resource in this device.";
+      op->SetDevice(d);
+    }
+  }
+  Device* device = op->Device();
+
+  Fprint128 cache_key = op->MutableAttrs()->CacheKey(
+      device == nullptr ? "unspecified" : device->name());
+  KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key);
+  if (kernel == nullptr) {
+    const NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
+    if (device == nullptr) {
+      status = SelectDevice(ndef, ctx, &device);
+      if (!status.ok()) return status;
+    }
+    CHECK(device != nullptr);
+    if (ctx->LogDevicePlacement()) {
+      LOG(INFO) << "Executing op " << ndef.op() << " in device "
+                << device->name();
+    }
+    kernel = new KernelAndDevice(ctx->GetRendezvous());
+    // Knowledge of the implementation of Init (and in-turn
+    // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def
+    // will be accessed, so grab on to the lock.
+    // See WARNING comment in Execute (before kernel->Run) - would be nice to
+    // rework to avoid this subtlety.
+    tf_shared_lock l(*ctx->FunctionsMu());
+    status = KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel);
+    if (!status.ok()) {
+      delete kernel;
+      return status;
+    }
+    // Update output_dtypes inside `kernel`.
+    const OpDef* op_def = nullptr;
+    const FunctionDef* function_def = ctx->FuncLibDef()->Find(ndef.op());
+    if (function_def != nullptr) {
+      op_def = &(function_def->signature());
+    }
+    if (op_def == nullptr) {
+      status = OpDefForOp(ndef.op().c_str(), &op_def);
+      if (!status.ok()) return status;
+    }
+    DataTypeVector input_dtypes;
+    status = InOutTypesForNode(ndef, *op_def, &input_dtypes,
+                               kernel->mutable_output_dtypes());
+    if (!status.ok()) return status;
+    ctx->AddKernelToCache(cache_key, kernel);
+  }
+  const DataTypeVector& output_dtypes = kernel->output_dtypes();
+  const int output_dtypes_size = static_cast<int>(output_dtypes.size());
+  if (output_dtypes_size > *num_retvals) {
+    return errors::InvalidArgument("Expecting ", output_dtypes.size(),
+                                   " outputs, but *num_retvals is ",
+                                   *num_retvals);
+  }
+  *num_retvals = output_dtypes_size;
+  if (device == nullptr) {
+    // TODO(apassos) debug how the assignment below might return a different
+    // device from the one requested above.
+    device = kernel->device();
+  }
+  status = ValidateInputTypeAndPlacement(
+      ctx, device, op, kernel->kernel(),
+      ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
+  if (!status.ok()) return status;
+  std::unique_ptr<NodeExecStats> maybe_stats;
+  if (ctx->ShouldStoreMetadata()) {
+    maybe_stats.reset(new NodeExecStats);
+    maybe_stats->set_node_name(op->Name());
+    maybe_stats->set_all_start_micros(Env::Default()->NowMicros());
+    maybe_stats->set_op_start_rel_micros(0);
+    maybe_stats->set_scheduled_micros(Env::Default()->NowMicros());
+    // TODO(apassos) track referenced tensors
+  }
+  retvals->resize(*num_retvals);
+  if (ctx->Async()) {
+    // Note that for async mode, execution order will make sure that all
+    // input handles are ready before executing them.
+    // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
+    tensorflow::uint64 id = ctx->NextId();
+    for (int i = 0; i < *num_retvals; ++i) {
+      (*retvals)[i] = new TensorHandle(id, output_dtypes[i], ctx);
+    }
+    EagerNode* node =
+        new ExecuteNode(id, ctx, op->Device(), op->Inputs(), kernel,
+                        maybe_stats.release(), output_dtypes, *retvals);
+    ctx->ExecutorAdd(node);
+  } else {
+    // Execute checks if retvals[i] is nullptr or not to figure if it needs to
+    // allocate it.
+    status = EagerExecute(ctx, op->Device(), op->Inputs(), kernel,
+                          maybe_stats.get(), retvals->data(), *num_retvals);
+  }
+
+  return status;
+}
+
 Status EagerExecute(EagerContext* ctx, Device* device,
                     const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
                     KernelAndDevice* kernel, NodeExecStats* maybe_stats,
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
index 0f6ad031e1..7c8d7e164d 100644
--- a/tensorflow/core/common_runtime/eager/execute.h
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
@@ -25,6 +26,12 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Utility function that executes a fully constructed EagerOperation.
+Status EagerExecute(
+    EagerOperation* op,
+    tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* retvals,
+    int* num_retvals);
+
 // Low-level utility to execute the kernel specified by kernel on device device,
 // with the inputs op_inputs, in the context ctx.
 Status EagerExecute(EagerContext* ctx, Device* device,
-- 
GitLab


From 60a0e2f5261cf72da4e4d8e65b56b695d611b984 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 15:19:59 -0700
Subject: [PATCH 253/450] Do not force default layout when there is no need to.
 Allow the inner computations to negotiate a root and parameter layouts
 different from default.

PiperOrigin-RevId: 193731341
---
 tensorflow/compiler/xla/service/BUILD         |   3 +
 .../xla/service/computation_layout.cc         |   7 +-
 .../compiler/xla/service/computation_layout.h |   5 +-
 .../compiler/xla/service/hlo_instruction.h    |   8 +
 .../compiler/xla/service/layout_assignment.cc | 328 +++++++++++++-----
 .../compiler/xla/service/layout_assignment.h  |  65 +++-
 tensorflow/compiler/xla/service/service.cc    |   5 +-
 .../compiler/xla/service/tuple_simplifier.cc  |  25 +-
 8 files changed, 325 insertions(+), 121 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 9555d91817..bc577c173d 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1953,10 +1953,12 @@ cc_library(
     deps = [
         ":computation_layout",
         ":hlo",
+        ":hlo_dce",
         ":hlo_graph_dumper",
         ":hlo_pass",
         ":logical_buffer",
         ":tuple_points_to_analysis",
+        ":tuple_simplifier",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -2433,6 +2435,7 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc
index d2d4f14fce..cb61f3da39 100644
--- a/tensorflow/compiler/xla/service/computation_layout.cc
+++ b/tensorflow/compiler/xla/service/computation_layout.cc
@@ -23,12 +23,15 @@ limitations under the License.
 
 namespace xla {
 
-ComputationLayout::ComputationLayout(const ProgramShape& program_shape)
+ComputationLayout::ComputationLayout(const ProgramShape& program_shape,
+                                     bool ignore_layouts)
     : result_layout_(program_shape.result()) {
   for (auto& shape : program_shape.parameters()) {
     parameter_layouts_.emplace_back(shape);
   }
-  SetToDefaultLayout();
+  if (ignore_layouts) {
+    SetToDefaultLayout();
+  }
 }
 
 void ComputationLayout::SetToDefaultLayout() {
diff --git a/tensorflow/compiler/xla/service/computation_layout.h b/tensorflow/compiler/xla/service/computation_layout.h
index 80e102411c..53c3a3f7b7 100644
--- a/tensorflow/compiler/xla/service/computation_layout.h
+++ b/tensorflow/compiler/xla/service/computation_layout.h
@@ -34,8 +34,9 @@ class ComputationLayout {
  public:
   // Constructs a ComputationLayout from a ProgramShape. The layouts of the
   // parameters and results are set to the default layout. Layouts in the
-  // ProgramShape are ignored.
-  explicit ComputationLayout(const ProgramShape& program_shape);
+  // ProgramShape are ignored if ignore_layouts is true.
+  explicit ComputationLayout(const ProgramShape& program_shape,
+                             bool ignore_layouts = true);
 
   // Returns the layout of a particular parameter.
   const ShapeLayout& parameter_layout(int64 param_no) const {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index a5e9aecb9e..f3da3fc256 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -956,6 +956,14 @@ class HloInstruction {
   void clear_sharding() { sharding_ = nullptr; }
   // Return true if this operator has a sharding assigned.
   bool has_sharding() const { return sharding_ != nullptr; }
+  // Checks whether the instruction has compatible sharding with the other
+  // instruction.
+  bool has_compatible_sharding(const HloInstruction* other) const {
+    if (!has_sharding()) {
+      return !other->has_sharding();
+    }
+    return other->has_sharding() ? sharding() == other->sharding() : false;
+  }
 
   // When creating a new instruction which either replaces, or shifts up (kCopy
   // insertion case), another instruction, we need to make sure the certain
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 2494569db5..7067b6f86a 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -31,10 +31,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -400,9 +402,9 @@ string LayoutConstraints::ToString() const {
 }
 
 Status LayoutAssignment::AddMandatoryConstraints(
-    const ComputationLayout& computation_layout,
-    const ChannelLayoutConstraints* channel_constraints,
-    HloComputation* computation, LayoutConstraints* constraints) {
+    const ComputationLayout* computation_layout,
+    ChannelLayoutConstraints* channel_constraints, HloComputation* computation,
+    LayoutConstraints* constraints) {
   VLOG(3) << "Adding mandatory layout constraints to computation "
           << computation->name();
 
@@ -424,11 +426,16 @@ Status LayoutAssignment::AddMandatoryConstraints(
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
           instruction->outfeed_shape(), instruction, 0));
     } else if (instruction->opcode() == HloOpcode::kParameter) {
-      // Parameter layouts must match the respective layout in
-      // ComputationLayout.
-      shape_with_layout =
-          &computation_layout.parameter_layout(instruction->parameter_number())
-               .shape();
+      if (computation_layout != nullptr) {
+        const ShapeLayout& parameter_layout =
+            computation_layout->parameter_layout(
+                instruction->parameter_number());
+        if (parameter_layout.LayoutIsSet()) {
+          // Parameter layouts must match the respective layout in
+          // ComputationLayout, if there is one.
+          shape_with_layout = &parameter_layout.shape();
+        }
+      }
     }
     if (shape_with_layout != nullptr) {
       TF_RETURN_IF_ERROR(
@@ -493,9 +500,8 @@ Status LayoutAssignment::AddMandatoryConstraints(
       HloComputation* body = instruction->while_body();
       HloComputation* condition = instruction->while_condition();
       const HloInstruction* init = instruction->operand(0);
-      const ComputationLayout& body_layout =
-          FindOrDie(computation_layouts_, body);
-      const ComputationLayout& condition_layout =
+      ComputationLayout& body_layout = FindOrDie(computation_layouts_, body);
+      ComputationLayout& condition_layout =
           FindOrDie(computation_layouts_, condition);
 
       // Check a few invariants irrespective of layout.
@@ -508,26 +514,19 @@ Status LayoutAssignment::AddMandatoryConstraints(
                                    condition_layout.parameter_shape(0)));
       DCHECK(ShapeUtil::Compatible(body_layout.result_shape(), init->shape()));
 
-      // Return error if earlier layout assignment of the embedded computations
-      // has produced conflicting layouts.
-      if (!ShapeUtil::Equal(body_layout.result_shape(),
-                            body_layout.parameter_shape(0))) {
-        return InternalError(
-            "Parameter and result of body computation %s of while instruction "
-            "%s have different layouts: %s vs %s",
-            body->name().c_str(), instruction->name().c_str(),
-            ShapeUtil::HumanString(body_layout.result_shape()).c_str(),
-            ShapeUtil::HumanString(body_layout.parameter_shape(0)).c_str());
+      if (body_layout.result_layout() != body_layout.parameter_layout(0)) {
+        VLOG(2) << "Reset %while body parameter layout: body=" << body->name()
+                << " while=" << instruction->name()
+                << " shape=" << body_layout.result_layout().ToString();
+        *body_layout.mutable_parameter_layout(0) = body_layout.result_layout();
       }
-      if (!ShapeUtil::Equal(body->root_instruction()->shape(),
-                            condition->parameter_instruction(0)->shape())) {
-        return InternalError(
-            "Parameter of condition computation %s of while instruction "
-            "%s does not match body computation %s result: %s vs %s",
-            condition->name().c_str(), instruction->name().c_str(),
-            body->name().c_str(),
-            ShapeUtil::HumanString(condition_layout.parameter_shape(0)).c_str(),
-            ShapeUtil::HumanString(body_layout.result_shape()).c_str());
+      if (condition_layout.parameter_layout(0) !=
+          body_layout.parameter_layout(0)) {
+        VLOG(2) << "Reset %while condition parameter layout: cond="
+                << condition->name() << " while=" << instruction->name()
+                << " shape=" << body_layout.parameter_layout(0).ToString();
+        *condition_layout.mutable_parameter_layout(0) =
+            body_layout.parameter_layout(0);
       }
 
       // Constrain the output and the operand of the while instruction to match
@@ -557,7 +556,20 @@ Status LayoutAssignment::AddMandatoryConstraints(
                                    true_computation_layout.parameter_shape(0)));
       DCHECK(ShapeUtil::Compatible(
           false_operand->shape(), false_computation_layout.parameter_shape(0)));
-
+      if (true_computation_layout.result_layout() !=
+          false_computation_layout.result_layout()) {
+        // We assign layouts in DFS fashion, so the true and false computations
+        // might have negotiated a different layout. But for the conditional
+        // instruction POV the layout must match, so we run again on the false
+        // computation, this time with proper computation layout.
+        VLOG(2) << "Reset %conditional false computation result layout: "
+                   "false_computation="
+                << false_computation->name()
+                << " conditional=" << instruction->name() << " shape="
+                << true_computation_layout.result_layout().ToString();
+        *false_computation_layout.mutable_result_layout() =
+            true_computation_layout.result_layout();
+      }
       TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
           true_computation_layout.result_shape(), instruction));
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
@@ -593,10 +605,14 @@ Status LayoutAssignment::AddMandatoryConstraints(
       }
     }
   }
-
-  // Finally set the result layout to match ComputationLayout.
-  return constraints->SetResultLayout(
-      computation_layout.result_layout().shape());
+  // Finally set the result layout to match ComputationLayout, if there is one.
+  if (computation_layout != nullptr) {
+    const ShapeLayout& result_layout = computation_layout->result_layout();
+    if (result_layout.LayoutIsSet()) {
+      TF_RETURN_IF_ERROR(constraints->SetResultLayout(result_layout.shape()));
+    }
+  }
+  return Status::OK();
 }
 
 namespace {
@@ -760,6 +776,7 @@ StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
     HloInstruction* copy =
         instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
             instruction->shape(), HloOpcode::kCopy, instruction));
+    RegisterAddedCopy(copy);
     SetupCopiedInstruction(*instruction, copy, {});
     LayoutUtil::ClearLayout(copy->mutable_shape());
     TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
@@ -783,13 +800,19 @@ Status LayoutAssignment::CopyOperandIfLayoutsDiffer(
   TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape()));
 
   if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) {
+    VLOG(5) << "Operand " << operand->ToString() << " layout matches in "
+            << instruction->ToString();
     // Operand layout already matches our constraint. Nothing to do.
     return Status::OK();
   }
+  VLOG(4) << "Operand " << operand->ToString() << " layout does not match "
+          << operand_layout.ToString() << " in " << instruction->ToString();
 
   TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy,
                       CreateCopyWithNewLayout(operand_layout.shape(), operand));
 
+  VLOG(4) << "New copy of " << operand->ToString() << " is "
+          << operand_copy->ToString();
   return instruction->ReplaceOperandWith(operand_no, operand_copy);
 }
 
@@ -896,15 +919,16 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) {
       }
     }
   }
-
-  // Finally verify the result layout matches the layout of the entry
+  // Finally verify the result layout, if set, matches the layout of the entry
   // computation root.
-  TF_RET_CHECK(ShapeUtil::Equal(
-      module->entry_computation()->root_instruction()->shape(),
+  const ShapeLayout& result_layout =
       FindOrDie(computation_layouts_, module->entry_computation())
-          .result_layout()
-          .shape()));
-
+          .result_layout();
+  if (result_layout.LayoutIsSet()) {
+    TF_RET_CHECK(ShapeUtil::Equal(
+        module->entry_computation()->root_instruction()->shape(),
+        result_layout.shape()));
+  }
   return Status::OK();
 }
 
@@ -913,18 +937,13 @@ LayoutAssignment::LayoutAssignment(
     ChannelLayoutConstraints* channel_constraints)
     : entry_computation_layout_(entry_computation_layout),
       channel_layout_constraints_(channel_constraints) {
-  VLOG(1) << "entry computation layout given to layout assignment: "
+  VLOG(1) << "Entry computation layout given to layout assignment: "
           << entry_computation_layout_->ToString();
   // Layouts of all parameter instructions must be set.
   for (const ShapeLayout& parameter_layout :
        entry_computation_layout_->parameter_layouts()) {
     CHECK(parameter_layout.LayoutIsSet());
   }
-  // If the result layout is not set, then choose the default.
-  // TODO(b/29118294): Choose a better layout in this case.
-  if (!entry_computation_layout_->result_layout().LayoutIsSet()) {
-    entry_computation_layout_->mutable_result_layout()->SetToDefaultLayout();
-  }
 }
 
 std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
@@ -1484,16 +1503,60 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
   return Status::OK();
 }
 
+Status LayoutAssignment::CalculateComputationLayout(
+    HloComputation* computation) {
+  ComputationLayout computation_layout(computation->ComputeProgramShape(),
+                                       /*ignore_layouts=*/false);
+  InsertOrDie(&computation_layouts_, computation, computation_layout);
+  VLOG(2) << "  Calculated ComputationLayout = "
+          << computation_layout.ToString();
+  return Status::OK();
+}
+
+Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) {
+  // Clear existing layouts of the instructions.  All layouts must be assigned
+  // by the LayoutAssignment pass, except for those on infeeds, parameters,
+  // and the computation result. The latter two are specified in
+  // computation_layout, so we only need to keep the existing layouts for
+  // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
+  // layout assignment pass that may accidentally use the existing layout.
+  for (HloInstruction* instruction : computation->instructions()) {
+    if (instruction->opcode() == HloOpcode::kBitcast) {
+      // bitcasts are inherently layout sensitive and so a bitcast instruction
+      // present in the IR before layout assignment is a bug.
+      return InternalError(
+          "Unexpected bitcast operation seen during layout assignment: %s.",
+          instruction->ToString().c_str());
+    }
+    if (instruction->opcode() != HloOpcode::kInfeed) {
+      LayoutUtil::ClearLayout(instruction->mutable_shape());
+    }
+  }
+  return Status::OK();
+}
+
 Status LayoutAssignment::RunOnComputation(
-    const ComputationLayout& computation_layout,
+    ComputationLayout* computation_layout,
     const TuplePointsToAnalysis& points_to_analysis,
     HloComputation* computation,
     ChannelLayoutConstraints* channel_constraints) {
-  DCHECK(computation_layout.LayoutIsSet());
-  InsertOrDie(&computation_layouts_, computation, computation_layout);
   VLOG(2) << "LayoutAssignment::RunOnComputation(" << computation->name()
           << ")";
-  VLOG(2) << "  ComputationLayout = " << computation_layout.ToString();
+  TF_RETURN_IF_ERROR(ClearComputationLayouts(computation));
+  if (computation_layout != nullptr) {
+    auto it = computation_layouts_.find(computation);
+    if (it == computation_layouts_.end()) {
+      VLOG(2) << "  New ComputationLayout = " << computation_layout->ToString();
+      computation_layouts_.emplace(computation, *computation_layout);
+    } else {
+      TF_RET_CHECK(computation_layout == &it->second ||
+                   computation_layout == entry_computation_layout_);
+      VLOG(2) << "  Existing ComputationLayout = "
+              << computation_layout->ToString();
+    }
+  } else {
+    VLOG(2) << "  No ComputationLayout specified (will be calculated)";
+  }
 
   // Construct LayoutConstraints with all layout constraints of the computation.
   LayoutConstraints constraints(points_to_analysis, computation);
@@ -1536,12 +1599,19 @@ Status LayoutAssignment::RunOnComputation(
     CHECK_LT(constraints.unconstrained_buffer_ids().size(),
              unconstrained_count);
   }
-
   // All logical buffers should have constraints at this point. All that
   // remains is assign the constraints to the buffers and infer layouts for
   // aliased buffers.
   TF_RETURN_IF_ERROR(AssignLayouts(constraints, computation));
 
+  // If the computation layout wasn't specified, now it is the time to compute
+  // it according to the parameters and root instruction layouts.
+  // This allows the first pass through this API to record the best flowing
+  // layout to parameters and root instruction.
+  if (computation_layout == nullptr) {
+    TF_RETURN_IF_ERROR(CalculateComputationLayout(computation));
+  }
+
   // Record the layouts assigned for any communication ops in
   // channel_constraints so that they are constrained for future modules.
   for (HloInstruction* instruction : computation->instructions()) {
@@ -1556,6 +1626,34 @@ Status LayoutAssignment::RunOnComputation(
   return Status::OK();
 }
 
+Status LayoutAssignment::PropagateComputationLayouts(
+    HloComputation* computation, ComputationLayout* computation_layout) {
+  ComputationLayout computed_computation_layout(
+      computation->ComputeProgramShape(),
+      /*ignore_layouts=*/false);
+  for (int64 i = 0; i < computed_computation_layout.parameter_count(); ++i) {
+    ShapeLayout* param_layout = computation_layout->mutable_parameter_layout(i);
+    if (!param_layout->LayoutIsSet()) {
+      VLOG(4) << "Assigning layout to parameter " << i << " of computation "
+              << computation->name() << ": "
+              << computed_computation_layout.parameter_layout(i).ToString();
+      *param_layout = computed_computation_layout.parameter_layout(i);
+    } else {
+      TF_RET_CHECK(computed_computation_layout.parameter_layout(i) ==
+                   *param_layout);
+    }
+  }
+  ShapeLayout* result_layout = computation_layout->mutable_result_layout();
+  if (!result_layout->LayoutIsSet()) {
+    VLOG(4) << "Assigning result layout of computation " << computation->name()
+            << ": " << computed_computation_layout.result_layout().ToString();
+    *result_layout = computed_computation_layout.result_layout();
+  } else {
+    TF_RET_CHECK(computed_computation_layout.result_layout() == *result_layout);
+  }
+  return Status::OK();
+}
+
 StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   VLOG(2) << "Running layout assignment on module " << module->name();
   XLA_VLOG_LINES(3, module->ToString());
@@ -1564,52 +1662,45 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
                                 "before layout assignment",
                                 module->config().debug_options());
   }
-
-  TF_ASSIGN_OR_RETURN(auto points_to_analysis,
-                      TuplePointsToAnalysis::Run(module));
-
-  // Assign layouts to computations in an order such that a callee computation
-  // is handled before its caller computation. This ensures that the layout of
-  // all callers of a computation will agree.
-  std::list<HloComputation*> computation_post_order =
-      module->MakeComputationPostOrder();
-  for (auto* computation : module->MakeComputationPostOrder()) {
-    if (computation->IsFusionComputation()) {
-      continue;
-    }
-    // Clear existing layouts of the instructions.  All layouts must be assigned
-    // by the LayoutAssignment pass, except for those on infeeds, parameters,
-    // and the computation result. The latter two are specified in
-    // computation_layout, so we only need to keep the existing layouts for
-    // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
-    // layout assignment pass that may accidently use the existing layout.
-    for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() == HloOpcode::kBitcast) {
-        // bitcasts are inherently layout sensitive and so a bitcast instruction
-        // present in the IR before layout assignment is a bug.
-        return InternalError(
-            "Unexpected bitcast operation seen during layout assignment: %s.",
-            instruction->ToString().c_str());
+  TF_RETURN_IF_ERROR(Init());
+
+  // We do two passes. The first one we pass a nullptr ComputationLayout to
+  // the RunOnComputation() calls (for non entry computations), and we register
+  // the ComputationLayout which are naturally flowing in DFS fashion to the
+  // parameters and root instruction.
+  // Walking in DFS mode though, means that we can end up with incorrect layouts
+  // when seen from an outer instruction, which has across-computation
+  // constraints to impose.
+  // For example, the kWhile instruction needs to enforce the same layouts for
+  // the parameters and root of the body, as well as the condition parameters.
+  // Similarly, the kConditional instruction needs to enforce the same layouts
+  // for the root of the true and false computations.
+  // So in the first pass, while allowing the layouts to flow to parameters and
+  // root, we also fix up the eventually inconsistent ComputationLayout, which
+  // will be then made mandatory by the second pass.
+  for (int64 i = 0; i < 2; ++i) {
+    TF_RETURN_IF_ERROR(ClearPreviousPassSideEffects(module));
+    TF_ASSIGN_OR_RETURN(auto points_to_analysis,
+                        TuplePointsToAnalysis::Run(module));
+    for (auto* computation : module->MakeComputationPostOrder()) {
+      if (computation->IsFusionComputation()) {
+        continue;
       }
-      if (instruction->opcode() != HloOpcode::kInfeed) {
-        LayoutUtil::ClearLayout(instruction->mutable_shape());
+      if (computation == module->entry_computation()) {
+        TF_RETURN_IF_ERROR(RunOnComputation(
+            entry_computation_layout_, *points_to_analysis,
+            module->entry_computation(), channel_layout_constraints_));
+      } else {
+        ComputationLayout* computation_layout =
+            (i == 0) ? nullptr : &FindOrDie(computation_layouts_, computation);
+        TF_RETURN_IF_ERROR(RunOnComputation(computation_layout,
+                                            *points_to_analysis, computation,
+                                            channel_layout_constraints_));
       }
     }
-    if (computation == module->entry_computation()) {
-      TF_RETURN_IF_ERROR(RunOnComputation(
-          *entry_computation_layout_, *points_to_analysis,
-          module->entry_computation(), channel_layout_constraints_));
-    } else {
-      ComputationLayout computation_layout(computation->ComputeProgramShape());
-      // Setting all embedded computations to the default layout is potentially
-      // suboptimal.
-      computation_layout.SetToDefaultLayout();
-      TF_RETURN_IF_ERROR(RunOnComputation(computation_layout,
-                                          *points_to_analysis, computation,
-                                          channel_layout_constraints_));
-    }
   }
-
+  TF_RETURN_IF_ERROR(PropagateComputationLayouts(module->entry_computation(),
+                                                 entry_computation_layout_));
   TF_RETURN_IF_ERROR(CheckLayouts(module));
 
   VLOG(3) << "After layout assignment:";
@@ -1619,9 +1710,54 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
                                 "after layout assignment",
                                 module->config().debug_options());
   }
-
   // All layouts are reset then reassigned by this pass.
   return true;
 }
 
+Status LayoutAssignment::Init() {
+  computation_layouts_.clear();
+  return Status::OK();
+}
+
+Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) {
+  // Clear all the copies which have been added, and all the related
+  // instructions (like GTE and tuples).
+  int64 removed_copies = 0;
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction :
+         computation->MakeInstructionPostOrder()) {
+      if (instruction->opcode() == HloOpcode::kCopy &&
+          added_copies_.count(instruction) > 0) {
+        VLOG(5) << "Removing added copy: " << instruction->ToString();
+        TF_RETURN_IF_ERROR(
+            instruction->ReplaceAllUsesWith(instruction->mutable_operand(0)));
+        TF_RETURN_IF_ERROR(computation->RemoveInstruction(instruction));
+        ++removed_copies;
+      }
+    }
+  }
+  added_copies_.clear();
+  if (removed_copies > 0) {
+    TupleSimplifier tuple_simplifier;
+    HloDCE dce;
+    TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
+    TF_RETURN_IF_ERROR(dce.Run(module).status());
+  }
+  return Status::OK();
+}
+
+Status LayoutAssignment::AddCopyForOperand(HloInstruction* instruction,
+                                           int64 operand_number) {
+  HloInstruction* operand = instruction->mutable_operand(operand_number);
+  if (operand->opcode() != HloOpcode::kCopy || operand->user_count() > 1) {
+    HloInstruction* copy =
+        instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
+            operand->shape(), HloOpcode::kCopy, operand));
+    SetupCopiedInstruction(*operand, copy, {});
+    LayoutUtil::ClearLayout(copy->mutable_shape());
+    TF_RETURN_IF_ERROR(instruction->ReplaceOperandWith(operand_number, copy));
+  }
+  return Status::OK();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index ae4986d6ad..8b4e07995a 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -362,12 +363,15 @@ class LayoutAssignment : public HloPassInterface {
       int64 operand_no);
 
  private:
+  // Initializes the layout assignment object for a new Run() call.
+  Status Init();
+
   // Adds constraints which must be satisfied for correctness on all
   // backends. Called once prior to propagating constraints.
-  Status AddMandatoryConstraints(
-      const ComputationLayout& computation_layout,
-      const ChannelLayoutConstraints* channel_constraints,
-      HloComputation* computation, LayoutConstraints* constraints);
+  Status AddMandatoryConstraints(const ComputationLayout* computation_layout,
+                                 ChannelLayoutConstraints* channel_constraints,
+                                 HloComputation* computation,
+                                 LayoutConstraints* constraints);
 
   // This method can be overridden to add backend-specific constraints to the
   // layout of the instructions of a computation. This method is called after
@@ -378,10 +382,12 @@ class LayoutAssignment : public HloPassInterface {
   }
 
   // Construct contraints and assign layouts to all instructions in the
-  // computation satisfying the given ComputationLayout. Layouts constraints are
-  // added, then propagated until all LogicalBuffers in the computation are
-  // constrained.
-  Status RunOnComputation(const ComputationLayout& computation_layout,
+  // computation satisfying the given ComputationLayout, if not nullptr.
+  // Otherwise the ComputationLayout will be calculated by propagating the
+  // computation instruction constraints.
+  // Layouts constraints are added, then propagated until all LogicalBuffers in
+  // the computation are constrained.
+  Status RunOnComputation(ComputationLayout* computation_layout,
                           const TuplePointsToAnalysis& points_to_analysis,
                           HloComputation* computation,
                           ChannelLayoutConstraints* channel_constraints);
@@ -402,6 +408,25 @@ class LayoutAssignment : public HloPassInterface {
   // necessary conditions.
   Status CheckLayouts(HloModule* module);
 
+  // Computes the ComputationLayout of the given computation based on the
+  // layouts assigned to parameters and root instruction, and inserts it to the
+  // computation_layouts_ map.
+  Status CalculateComputationLayout(HloComputation* computation);
+
+  // Clears all the layouts which can be cleared within a computation.
+  Status ClearComputationLayouts(HloComputation* computation);
+
+  // Clears the side effects of a previous pass, like added copy instructions.
+  Status ClearPreviousPassSideEffects(HloModule* module);
+
+  // Propagates the layouts computed by the layout assignment pass on the given
+  // computation, to the computation layout passed in to this API.
+  // This API propagates missing layouts, and also checks that the layouts
+  // specified by the caller have been respected, by comparing them with those
+  // of the computation's parameters and root instruction.
+  Status PropagateComputationLayouts(HloComputation* computation,
+                                     ComputationLayout* computation_layout);
+
   ComputationLayout* entry_computation_layout_;
 
  protected:
@@ -418,21 +443,37 @@ class LayoutAssignment : public HloPassInterface {
   // Creates and returns a copy of the given instruction with a different
   // layout. Tuple-shaped instructions will be deep-copied, and the last Tuple
   // instruction producing the copy is returned.
-  static StatusOr<HloInstruction*> CreateCopyWithNewLayout(
+  StatusOr<HloInstruction*> CreateCopyWithNewLayout(
       const Shape& shape_with_layout, HloInstruction* instruction);
 
   // Creates a copy of the given operand if the operand's layout does not match
   // the given layout. This copy replaces the use in the given instruction.
   // Tuple operands will be deep-copied.
-  static Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
-                                           HloInstruction* instruction,
-                                           int64 operand_no);
+  Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
+                                    HloInstruction* instruction,
+                                    int64 operand_no);
+
+  // Registers a copy instruction added by the layout assignment pass.
+  void RegisterAddedCopy(HloInstruction* copy) {
+    CHECK_EQ(copy->opcode(), HloOpcode::kCopy);
+    added_copies_.insert(copy);
+  }
+
+  // Adds a copy for the operand of an instruction, unless such operand is
+  // already a copy, and has a single user (which is necessarily the
+  // instruction itself).
+  Status AddCopyForOperand(HloInstruction* instruction, int64 operand_number);
 
   // Map containing the layouts of all computations assigned so
   // far. Computations are handled in a topological sort where computations are
   // handled before their caller instructions so the layouts of caller
   // instructions can be set to match the computation.
   std::map<HloComputation*, ComputationLayout> computation_layouts_;
+
+  // Every copy added to the module by the layout assignment pass is registered
+  // here.
+  tensorflow::gtl::FlatSet<HloInstruction*> added_copies_;
+
   ChannelLayoutConstraints* channel_layout_constraints_;
 };
 
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 39f3aefdf8..a73118c68a 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -308,7 +308,10 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
         computation_layout->mutable_result_layout()->CopyLayoutFromShape(
             shape_with_output_layout));
   } else {
-    computation_layout->mutable_result_layout()->Clear();
+    // TODO(b/78356948): We are forcing the default layout here. We should fix
+    // clients which expect a default layout, to be explicit about it, by
+    // passing the proper ExecutionOptions with shape_with_output_layout set.
+    computation_layout->mutable_result_layout()->SetToDefaultLayout();
   }
 
   config->set_replica_count(options_.number_of_replicas());
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc
index 113c2e2bd9..d668855084 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier.cc
+++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc
@@ -69,6 +69,7 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
       //       Tuple
       //
       HloInstruction* top_tuple = nullptr;
+      HloInstruction* first_gte = nullptr;
       bool can_simplify = true;
       for (int64 operand_number = 0;
            operand_number < instruction->operand_count(); ++operand_number) {
@@ -78,11 +79,17 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
           can_simplify = false;
           break;
         }
-
+        if (first_gte == nullptr) {
+          first_gte = operand;
+        } else if (!first_gte->has_compatible_sharding(operand)) {
+          can_simplify = false;
+          break;
+        }
         if (top_tuple == nullptr) {
           top_tuple = operand->mutable_operand(0);
           if (!ShapeUtil::Compatible(top_tuple->shape(),
-                                     instruction->shape())) {
+                                     instruction->shape()) ||
+              !instruction->has_compatible_sharding(top_tuple)) {
             can_simplify = false;
             break;
           }
@@ -108,15 +115,17 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
       //          |
       //         GTE
       if (instruction->operand(0)->opcode() == HloOpcode::kTuple) {
-        changed = true;
         HloInstruction* element_source =
             instruction->mutable_operand(0)->mutable_operand(
                 instruction->tuple_index());
-        TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source));
-        for (HloInstruction* user : element_source->users()) {
-          if (user->opcode() == HloOpcode::kTuple ||
-              user->opcode() == HloOpcode::kGetTupleElement) {
-            worklist.push(user);
+        if (instruction->has_compatible_sharding(element_source)) {
+          changed = true;
+          TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source));
+          for (HloInstruction* user : element_source->users()) {
+            if (user->opcode() == HloOpcode::kTuple ||
+                user->opcode() == HloOpcode::kGetTupleElement) {
+              worklist.push(user);
+            }
           }
         }
       }
-- 
GitLab


From 6af31f6260161bab02db83d7e9e1d7ba7fd14b2c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 15:20:37 -0700
Subject: [PATCH 254/450] [XLA] Redesign: add comparator and printer for the
 XlaOp.

This is to prepare the migration of tf2xla. Some existing code used ComputationDataHandle::handle() for comparison/printing; this change implements the equivalent comparator and printer for XlaOp.

PiperOrigin-RevId: 193731437
---
 .../compiler/xla/client/xla_client/xla_builder.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 5977ee4f4b..4955f1515d 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -57,11 +57,27 @@ class XlaOp {
 
   StatusOr<Shape> GetShape() const;
 
+  const XlaBuilder* builder() const { return builder_; }
+
+  bool operator==(const XlaOp& rhs) const {
+    return handle_ == rhs.handle_ && builder_ == rhs.builder_;
+  }
+
+  bool operator!=(const XlaOp& rhs) const {
+    return handle_ != rhs.handle_ || builder_ != rhs.builder_;
+  }
+
+  friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) {
+    out << op.handle();
+    return out;
+  }
+
  private:
   XlaOp(int64 handle, XlaBuilder* builder)
       : handle_(handle), builder_(builder) {}
 
   int64 handle() const { return handle_; }
+
   friend class XlaBuilder;
 
   int64 handle_;
-- 
GitLab


From cadbb0b70b9441388a04533433245ac85f2887a9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 15:32:32 -0700
Subject: [PATCH 255/450] [XLA] Redesign: implement DumpToDirectory for the
 HloSession.

This is to prepare the migration of tf2xla.

PiperOrigin-RevId: 193733029
---
 tensorflow/compiler/xla/service/BUILD         |  1 +
 tensorflow/compiler/xla/service/executable.cc | 20 +++++++++++++++++++
 tensorflow/compiler/xla/service/executable.h  |  5 +++++
 3 files changed, 26 insertions(+)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index bc577c173d..afb344e5ae 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -755,6 +755,7 @@ cc_library(
         ":hlo",
         ":hlo_execution_profile",
         ":hlo_graph_dumper",
+        ":hlo_proto",
         ":pool",
         ":session_proto",
         ":shaped_buffer",
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index b097ef79cc..8218b5f7c8 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -163,4 +163,24 @@ Status Executable::DumpSessionModule() {
                                        result);
 }
 
+/* static */ Status Executable::DumpToDirectory(const string& directory_path,
+                                                string filename,
+                                                const HloSession& hlo_session) {
+  tensorflow::Env* env = tensorflow::Env::Default();
+  if (!env->IsDirectory(directory_path).ok()) {
+    // NB! CreateDir does not work reliably with multiple XLA threads -- two
+    // threads can race to observe the absence of the dump directory and
+    // simultaneously try to create it, causing the "losing" thread to get a
+    // "directory already exists" error.
+    TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory_path));
+  }
+  filename = SanitizeFileName(std::move(filename));
+  string file_path = tensorflow::io::JoinPath(directory_path, filename);
+  string result;
+  TF_RET_CHECK(
+      tensorflow::SerializeToStringDeterministic(hlo_session, &result));
+  return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path,
+                                       result);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 9c725f21d8..bdbe119120 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -155,6 +156,10 @@ class Executable {
   static Status DumpToDirectory(const string& directory_path, string filename,
                                 const SessionModule& session_module);
 
+  // Dump hlo_session to directory_path/filename.
+  static Status DumpToDirectory(const string& directory_path, string filename,
+                                const HloSession& hlo_session);
+
  protected:
   mutable tensorflow::mutex mutex_;
 
-- 
GitLab


From b2f786867dca85b6b848f09f2c1d40dd123fc0fc Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 20 Apr 2018 15:38:06 -0700
Subject: [PATCH 256/450] Always use the local worker name in
 CreateWorkerSession when not doing ClusterSpec propagation.

Previously, the master would send a job name and task index in an
otherwise-empty ServerDef, and the worker would unquestioningly use
those to build its worker name. However, this would lead to errors if
the worker had a local name like "/job:worker/replica:1/task:0",
because the ServerDef doesn't support non-zero replica IDs, and so the
local worker would end up with an inconsistent view of what its worker name
should be. In particular `WorkerSession::worker_name` would disagree
with the device names added during graph partitioning by the master,
which would lead to runtime failures ("InvalidArgumentError: Invalid
rendezvous key").

PiperOrigin-RevId: 193733855
---
 tensorflow/core/distributed_runtime/BUILD     |  1 +
 .../distributed_runtime/master_session.cc     | 28 +++++++++---------
 .../core/distributed_runtime/session_mgr.cc   |  6 ++--
 .../distributed_runtime/session_mgr_test.cc   | 29 +++++++++++++++++++
 4 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index d564727da5..343dd5d456 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -145,6 +145,7 @@ tf_cc_test(
     deps = [
         ":session_mgr",
         ":worker_env",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index ebe350d313..e3022f38a2 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -1219,17 +1219,6 @@ Status MasterSession::CreateWorkerSessions(
     workers[i].name = &worker_names[i];
     workers[i].worker = worker_cache->CreateWorker(worker_names[i]);
     workers[i].request.set_session_handle(handle_);
-    if (options.cluster_def) {
-      *workers[i].request.mutable_server_def()->mutable_cluster() =
-          *options.cluster_def;
-      workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
-      // Session state is always isolated when ClusterSpec propagation
-      // is in use.
-      workers[i].request.set_isolate_session_state(true);
-    } else {
-      workers[i].request.set_isolate_session_state(
-          session_opts_.config.isolate_session_state());
-    }
 
     DeviceNameUtils::ParsedName name;
     if (!DeviceNameUtils::ParseFullName(worker_names[i], &name)) {
@@ -1243,8 +1232,21 @@ Status MasterSession::CreateWorkerSessions(
       return status;
     }
 
-    workers[i].request.mutable_server_def()->set_job_name(name.job);
-    workers[i].request.mutable_server_def()->set_task_index(name.task);
+    if (options.cluster_def) {
+      *workers[i].request.mutable_server_def()->mutable_cluster() =
+          *options.cluster_def;
+      workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
+      workers[i].request.mutable_server_def()->set_job_name(name.job);
+      workers[i].request.mutable_server_def()->set_task_index(name.task);
+      // Session state is always isolated when ClusterSpec propagation
+      // is in use.
+      workers[i].request.set_isolate_session_state(true);
+    } else {
+      // NOTE(mrry): Do not set any component of the ServerDef,
+      // because the worker will use its local configuration.
+      workers[i].request.set_isolate_session_state(
+          session_opts_.config.isolate_session_state());
+    }
   }
 
   for (size_t i = 0; i < worker_names.size(); ++i) {
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 357e9f8930..7ef4206c78 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -43,6 +43,7 @@ SessionMgr::SessionMgr(
               new GraphMgr(worker_env, worker_env->device_mgr)))),
       worker_cache_factory_(std::move(worker_cache_factory)) {}
 
+/* static */
 string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
   return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:",
                          server_def.task_index());
@@ -56,13 +57,14 @@ Status SessionMgr::CreateSession(const string& session,
     return errors::InvalidArgument("Session must be non-empty.");
   }
 
-  const string worker_name = WorkerNameFromServerDef(server_def);
-
   WorkerCacheInterface* worker_cache = nullptr;
+  string worker_name;
   if (server_def.cluster().job().empty()) {
     worker_cache = new WorkerCacheWrapper(default_worker_cache_.get());
+    worker_name = legacy_session_->worker_name;
   } else {
     TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
+    worker_name = WorkerNameFromServerDef(server_def);
   }
 
   if (worker_cache != nullptr & default_worker_cache_.get() != nullptr) {
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 0da333833a..99192119a6 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
 
 namespace tensorflow {
 
@@ -77,6 +78,34 @@ TEST_F(SessionMgrTest, CreateSessionSimple) {
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
+TEST_F(SessionMgrTest, CreateSessionClusterDefWorkerName) {
+  ServerDef server_def;
+  server_def.set_job_name("worker");
+  server_def.set_task_index(3);
+  auto job = server_def.mutable_cluster()->add_job();
+  job->set_name("worker");
+  job->mutable_tasks()->insert({3, "localhost:3333"});
+
+  string session_handle = "test_session_handle";
+  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
+  EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
+  EXPECT_EQ("/job:worker/replica:0/task:3", session->worker_name);
+  TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
+}
+
+TEST_F(SessionMgrTest, CreateSessionDefaultWorkerName) {
+  ServerDef server_def;
+  string session_handle = "test_session_handle";
+  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
+  EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
+  EXPECT_EQ("/job:mnist/replica:0/task:0", session->worker_name);
+  TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
+}
+
 TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
   ServerDef server_def;
   server_def.set_job_name("worker");
-- 
GitLab


From c015a45646029f8c116028505f2da9e023b5c2b7 Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Fri, 20 Apr 2018 15:51:16 -0700
Subject: [PATCH 257/450] Support legacy clusters

PiperOrigin-RevId: 193735742
---
 .../cluster_resolver/python/training/tpu_cluster_resolver.py | 2 +-
 .../python/training/tpu_cluster_resolver_test.py             | 3 +--
 tensorflow/contrib/tpu/python/tpu/tpu_config.py              | 5 +++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 5a2771229d..1403483d28 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -245,7 +245,7 @@ class TPUClusterResolver(ClusterResolver):
     else:
       if not self._tpu.startswith(compat.as_bytes('grpc://')):
         # Case 3.
-        return server_lib.ClusterSpec({})
+        return None
       # Case 2.
       cluster_spec = {self._job_name: [self._tpu[len(
           compat.as_bytes('grpc://')):]]}
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index dff7a03b68..5b3f9be5a1 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -356,8 +356,7 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/foo/bar')
     self.assertEqual(
         compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master())
-    self.assertEqual(
-        server_lib.ClusterSpec({}), tpu_cluster_resolver.cluster_spec())
+    self.assertEqual(None, tpu_cluster_resolver.cluster_spec())
 
   def testGkeEnvironment(self):
     os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index cc1a7fd801..6d7331e3c7 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -210,8 +210,9 @@ class RunConfig(run_config_lib.RunConfig):
         raise ValueError(
             'You cannot provide a ClusterResolver and '
             'session_config.cluster_def.')
-      self._session_config.cluster_def.CopyFrom(
-          self._cluster_spec.as_cluster_def())
+      if self._cluster_spec:
+        self._session_config.cluster_def.CopyFrom(
+            self._cluster_spec.as_cluster_def())
 
   @property
   def evaluation_master(self):
-- 
GitLab


From a0071844d0af47f22ab512363b56383acf762dff Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 16:05:47 -0700
Subject: [PATCH 258/450] Remove protected data members from
 GraphOptimizerStage.

PiperOrigin-RevId: 193737654
---
 .../optimizers/arithmetic_optimizer.cc        | 54 +++++++++----------
 .../optimizers/graph_optimizer_stage.h        |  5 +-
 2 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 232132e1e8..ed199c1ac8 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -294,8 +294,8 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
       for (int i = src->input_size() - 1; i >= 0; --i) {
         if (IsControlInput(src->input(i))) {
           *target_node->add_input() = src->input(i);
-          ctx_.node_map->AddOutput(NodeName(src->input(i)),
-                                   target_node->name());
+          ctx().node_map->AddOutput(NodeName(src->input(i)),
+                                    target_node->name());
         } else {
           break;
         }
@@ -442,7 +442,7 @@ class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
   // TODO(ezhulenev): move to GraphOptimizerStage?
   bool DrivesControlDependency(const NodeDef& node) const {
     int position;
-    for (const NodeDef* output : ctx_.node_map->GetOutputs(node.name())) {
+    for (const NodeDef* output : ctx().node_map->GetOutputs(node.name())) {
       for (int i = 0; i < output->input_size(); ++i) {
         auto input = output->input(i);
         string name = ParseNodeName(input, &position);
@@ -476,8 +476,8 @@ class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
   }
 
   bool IsInPreserveSet(const NodeDef& node) const {
-    return ctx_.nodes_to_preserve->find(node.name()) !=
-           ctx_.nodes_to_preserve->end();
+    return ctx().nodes_to_preserve->find(node.name()) !=
+           ctx().nodes_to_preserve->end();
   }
 
   bool IsAlreadyOptimized(const NodeDef& node) const {
@@ -546,7 +546,7 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
     // with a single output data consumer (presumably if we reach this node from
     // previously absorbed or a root node, it means that this node is not used
     // as an input to any other op, outside of the group)
-    if (NumNonControlDataOutputs(node, *ctx_.node_map) != 1) {
+    if (NumNonControlDataOutputs(node, *ctx().node_map) != 1) {
       return false;
     }
     // All input shapes must be broadcastable to the node shape
@@ -685,7 +685,7 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
     (*node->mutable_attr())["N"].set_i(inputs.size());
 
     for (const auto& inputAndShape : inputs) {
-      ctx_.node_map->AddOutput(inputAndShape.input, node_name);
+      ctx().node_map->AddOutput(inputAndShape.input, node_name);
       node->add_input(inputAndShape.input);
     }
 
@@ -707,8 +707,8 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
     node->set_device(root_node.device());
     (*node->mutable_attr())["T"].set_type(dtype);
 
-    ctx_.node_map->AddOutput(left.input, node_name);
-    ctx_.node_map->AddOutput(right.input, node_name);
+    ctx().node_map->AddOutput(left.input, node_name);
+    ctx().node_map->AddOutput(right.input, node_name);
 
     node->add_input(left.input);
     node->add_input(right.input);
@@ -784,20 +784,20 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
           new_outer_node->set_input(1, new_add_node->name());
         }
 
-        ctx_.node_map->AddOutput(common_factor, new_outer_node->name());
-        ctx_.node_map->AddOutput(new_add_node->name(), new_outer_node->name());
+        ctx().node_map->AddOutput(common_factor, new_outer_node->name());
+        ctx().node_map->AddOutput(new_add_node->name(), new_outer_node->name());
 
         // Hoist non-shared factors up into the new AddN node.
         for (int i = 0; i < unique_factors.size(); ++i) {
           const string& unique_factor_i = unique_factors[i];
           new_add_node->set_input(i, unique_factor_i);
-          ctx_.node_map->AddOutput(unique_factor_i, new_add_node->name());
+          ctx().node_map->AddOutput(unique_factor_i, new_add_node->name());
         }
 
         // Add control deps on add node
         for (const string& ctrl_dep : ctrl_deps) {
           *new_add_node->add_input() = ctrl_dep;
-          ctx_.node_map->AddOutput(NodeName(ctrl_dep), new_add_node->name());
+          ctx().node_map->AddOutput(NodeName(ctrl_dep), new_add_node->name());
         }
 
         // optimize new inner aggregation node
@@ -931,8 +931,8 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
     // if graph rewrite happens in multiple passes without graph pruning between
     // them, it's possible that rewritten node already exists in a graph
     return rewritten_nodes_.find(node->name()) != rewritten_nodes_.end() ||
-           ctx_.node_map->NodeExists(OuterNodeName(node, false)) ||
-           ctx_.node_map->NodeExists(OuterNodeName(node, true));
+           ctx().node_map->NodeExists(OuterNodeName(node, false)) ||
+           ctx().node_map->NodeExists(OuterNodeName(node, true));
   }
 
   // keep names of the nodes that were optimized by this stage
@@ -996,7 +996,7 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
     }
     // Optimized nodes updated in place, and that would break the graph, if the
     // node has multiple output consumers
-    if (NumNonControlOutputs(node, *ctx_.node_map) != 1) {
+    if (NumNonControlOutputs(node, *ctx().node_map) != 1) {
       return false;
     }
     // All input shapes must be broadcastable to the node shape
@@ -1120,13 +1120,13 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
       node->set_input(0, input_0);
       node->set_input(1, input_1);
       // Invalidate node properties (shape)
-      ctx_.graph_properties->ClearOutputProperties(node->name());
-      ctx_.graph_properties->ClearInputProperties(node->name());
+      ctx().graph_properties->ClearOutputProperties(node->name());
+      ctx().graph_properties->ClearInputProperties(node->name());
       // Update the node map
-      ctx_.node_map->RemoveOutput(NodeName(old_input_0), node->name());
-      ctx_.node_map->RemoveOutput(NodeName(old_input_1), node->name());
-      ctx_.node_map->AddOutput(NodeName(input_0), node->name());
-      ctx_.node_map->AddOutput(NodeName(input_1), node->name());
+      ctx().node_map->RemoveOutput(NodeName(old_input_0), node->name());
+      ctx().node_map->RemoveOutput(NodeName(old_input_1), node->name());
+      ctx().node_map->AddOutput(NodeName(input_0), node->name());
+      ctx().node_map->AddOutput(NodeName(input_1), node->name());
       // Add updated node to optimization queue
       AddToOptimizationQueue(node);
     }
@@ -1257,8 +1257,8 @@ class RemoveRedundantBitcastStage : public ArithmeticOptimizerStage {
       // Bitcast(Bitcast(x, type1), type2) => Bitcast(x, type2)
       bitcast->set_input(0, operand->input(0));
       SetSourceDataType(GetSourceDataType(*operand), bitcast);
-      ctx_.node_map->UpdateInput(bitcast->name(), bitcast->input(0),
-                                 operand->input(0));
+      ctx().node_map->UpdateInput(bitcast->name(), bitcast->input(0),
+                                  operand->input(0));
       AddToOptimizationQueue(bitcast);
       *simplified_node_name = bitcast->name();
     }
@@ -1313,14 +1313,14 @@ class RemoveNegationStage : public ArithmeticOptimizerStage {
         node->mutable_input()->SwapElements(0, 1);
         node->set_input(1, x->input(0));
         node->add_input(AsControlDependency(x->name()));
-        ctx_.node_map->AddOutput(NodeName(x->input(0)), node_name);
+        ctx().node_map->AddOutput(NodeName(x->input(0)), node_name);
         updated = true;
       } else if (IsNeg(*y)) {
         // a + (-b) = a - b
         node->set_op("Sub");
         node->set_input(1, y->input(0));
         node->add_input(AsControlDependency(y->name()));
-        ctx_.node_map->AddOutput(NodeName(y->input(0)), node_name);
+        ctx().node_map->AddOutput(NodeName(y->input(0)), node_name);
         updated = true;
       }
     } else if (IsSub(*node)) {
@@ -1329,7 +1329,7 @@ class RemoveNegationStage : public ArithmeticOptimizerStage {
         node->set_op("Add");
         node->set_input(1, y->input(0));
         node->add_input(AsControlDependency(y->name()));
-        ctx_.node_map->AddOutput(NodeName(y->input(0)), node_name);
+        ctx().node_map->AddOutput(NodeName(y->input(0)), node_name);
         updated = true;
       }
     }
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index ed398525f3..089cad36e9 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -182,7 +182,10 @@ class GraphOptimizerStage {
     return ::tensorflow::grappler::AddEmptyNode(ctx_, name);
   }
 
- protected:  // Data members
+ protected:
+  const GraphOptimizerContext& ctx() const { return ctx_; }
+
+ private:  // Data members
   const string optimizer_name_;
   const string stage_name_;
   const GraphOptimizerContext ctx_;
-- 
GitLab


From 3fa8795c511931b55a9703956bdf564fde817c2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Branchaud-Charron?=
 <frederic.branchaud-charron@usherbrooke.ca>
Date: Fri, 20 Apr 2018 19:10:41 -0400
Subject: [PATCH 259/450] Fix casting in Keras estimator (#18104)

---
 .../python/keras/_impl/keras/estimator.py     | 22 +++++++++++++----
 .../keras/_impl/keras/estimator_test.py       | 24 +++++++++++++++----
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index b922a6c683..c3c3fceb45 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -29,12 +29,14 @@ from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import models
 from tensorflow.python.keras._impl.keras import optimizers
 from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
 from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
 from tensorflow.python.ops import variables as variables_module
@@ -55,6 +57,17 @@ def _cast_tensor_to_floatx(x):
     return math_ops.cast(x, K.floatx())
 
 
+def _convert_tensor(x):
+  """Convert x to a (sparse) tensor if needed; cast numeric ones to floatx."""
+  if not tensor_util.is_tensor(x):
+    # x is not a tensor yet (e.g. a numpy array): convert it first.
+    x = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(x)
+  if check_ops.is_numeric_tensor(x):
+    # is_numeric_tensor returns False if provided with a numpy array
+    x = _cast_tensor_to_floatx(x)
+  return x
+
+
 def _any_variable_initalized():
   """Check if any variable has been initialized in the Keras model.
 
@@ -86,7 +99,7 @@ def _create_ordered_io(keras_model, estimator_io, is_input=True):
   if isinstance(estimator_io, (list, tuple)):
     # Case currently not supported by most built-in input_fn,
     # but it's good to have for sanity
-    return [_cast_tensor_to_floatx(x) for x in estimator_io]
+    return [_convert_tensor(x) for x in estimator_io]
   elif isinstance(estimator_io, dict):
     if is_input:
       if keras_model._is_graph_network:
@@ -108,12 +121,12 @@ def _create_ordered_io(keras_model, estimator_io, is_input=True):
             'It needs to match one '
             'of the following: %s' % ('input' if is_input else 'output', key,
                                       ', '.join(keras_io_names)))
-      tensors = [_cast_tensor_to_floatx(estimator_io[io_name])
+      tensors = [_convert_tensor(estimator_io[io_name])
                  for io_name in keras_io_names]
     return tensors
   else:
     # Plain array.
-    return _cast_tensor_to_floatx(estimator_io)
+    return _convert_tensor(estimator_io)
 
 
 def _in_place_subclassed_model_reset(model):
@@ -274,8 +287,7 @@ def _clone_and_build_model(mode,
                                         is_input=False)
   else:
     target_tensors = [
-        _cast_tensor_to_floatx(
-            sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(labels))
+        _convert_tensor(labels)
     ]
 
   if keras_model._is_graph_network:
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index 653cdc01e2..80fa87d041 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.applications import mobilenet
 from tensorflow.python.keras._impl.keras.optimizers import SGD
@@ -142,16 +143,20 @@ def randomize_io_type(array, name):
 
 
 def multi_inputs_multi_outputs_model():
-  # test multi-input layer
   a = keras.layers.Input(shape=(16,), name='input_a')
   b = keras.layers.Input(shape=(16,), name='input_b')
+  m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m')
   dense = keras.layers.Dense(8, name='dense_1')
+
   a_2 = dense(a)
+  # Apply a mask
+  s_2 = keras.layers.Lambda(lambda k:
+                            K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2])
   b_2 = dense(b)
-  merged = keras.layers.concatenate([a_2, b_2], name='merge')
+  merged = keras.layers.concatenate([s_2, b_2], name='merge')
   c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
   d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged)
-  model = keras.models.Model(inputs=[a, b], outputs=[c, d])
+  model = keras.models.Model(inputs=[a, b, m], outputs=[c, d])
   model.compile(
       loss='categorical_crossentropy',
       optimizer='rmsprop',
@@ -352,18 +357,27 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         test_samples=50,
         input_shape=(16,),
         num_classes=2)
+    np.random.seed(_RANDOM_SEED)
+    (input_m_train, _), (input_m_test, _) = testing_utils.get_test_data(
+        train_samples=_TRAIN_SIZE,
+        test_samples=50,
+        input_shape=(8,),
+        num_classes=2)
+
     c_train = keras.utils.to_categorical(c_train)
     c_test = keras.utils.to_categorical(c_test)
     d_train = keras.utils.to_categorical(d_train)
     d_test = keras.utils.to_categorical(d_test)
 
     def train_input_fn():
-      input_dict = {'input_a': a_train, 'input_b': b_train}
+      input_dict = {'input_a': a_train, 'input_b': b_train,
+                    'input_m': input_m_train > 0}
       output_dict = {'dense_2': c_train, 'dense_3': d_train}
       return input_dict, output_dict
 
     def eval_input_fn():
-      input_dict = {'input_a': a_test, 'input_b': b_test}
+      input_dict = {'input_a': a_test, 'input_b': b_test,
+                    'input_m': input_m_test > 0}
       output_dict = {'dense_2': c_test, 'dense_3': d_test}
       return input_dict, output_dict
 
-- 
GitLab


From cd095e0c455b3df98841ca70ba24fd41935552e7 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Fri, 20 Apr 2018 16:18:29 -0700
Subject: [PATCH 260/450] tf.contrib.data.scan: Support eager execution.

PiperOrigin-RevId: 193739234
---
 .../contrib/data/python/kernel_tests/BUILD    |  1 +
 .../kernel_tests/scan_dataset_op_test.py      | 23 ++++++++++++-------
 .../contrib/data/python/ops/scan_ops.py       |  1 +
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 05a4f5028a..9d1e8b20c2 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -343,6 +343,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
index e0494736b7..1a97a84b2c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
@@ -24,9 +24,11 @@ import numpy as np
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import scan_ops
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -57,19 +59,24 @@ class ScanDatasetTest(test.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(next_element)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testFibonacci(self):
     iterator = dataset_ops.Dataset.from_tensors(1).repeat(None).apply(
         scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1]))
     ).make_one_shot_iterator()
-    next_element = iterator.get_next()
 
-    with self.test_session() as sess:
-      self.assertEqual(1, sess.run(next_element))
-      self.assertEqual(1, sess.run(next_element))
-      self.assertEqual(2, sess.run(next_element))
-      self.assertEqual(3, sess.run(next_element))
-      self.assertEqual(5, sess.run(next_element))
-      self.assertEqual(8, sess.run(next_element))
+    if context.executing_eagerly():
+      next_element = iterator.get_next
+    else:
+      get_next = iterator.get_next()
+      next_element = lambda: get_next
+
+    self.assertEqual(1, self.evaluate(next_element()))
+    self.assertEqual(1, self.evaluate(next_element()))
+    self.assertEqual(2, self.evaluate(next_element()))
+    self.assertEqual(3, self.evaluate(next_element()))
+    self.assertEqual(5, self.evaluate(next_element()))
+    self.assertEqual(8, self.evaluate(next_element()))
 
   def testChangingStateShape(self):
     # Test the fixed-point shape invariant calculations: start with
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index 1c88366273..711a538697 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -144,6 +144,7 @@ class _ScanDataset(dataset_ops.Dataset):
                                                    weakened_state_shapes)
 
     self._scan_func = tf_scan_func
+    self._scan_func.add_to_graph(ops.get_default_graph())
 
   def _as_variant_tensor(self):
     input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-- 
GitLab


From 8d3a41f459b776856ff668bb076d4bc449927e09 Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Fri, 20 Apr 2018 16:30:02 -0700
Subject: [PATCH 261/450] [XLA] Remove constant cast in literal util.

It's not portable to modify an underlying char array of a c++ string object: (https://stackoverflow.com/questions/5729203/modifying-underlying-char-array-of-a-c-string-object)

RELNOTES: n/a
PiperOrigin-RevId: 193740595
---
 tensorflow/compiler/xla/literal_util.cc | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index c315b4ff30..bb6dd4f909 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -44,8 +44,16 @@ namespace {
 
 constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
 
-// Converts between little and big endian, assuming elements in the array are 16
-// bits long.
+// Converts between little and big endian by swapping each adjacent byte pair.
+//
+// Precondition: size % 2 == 0 (elements in the array are 16 bits long)
+void ConvertEndianShort(string* bytes) {
+  CHECK_EQ(bytes->size() % 2, 0);  // Enforce the even-size precondition.
+  for (int64 i = 0; i < bytes->size(); i += 2) {
+    std::swap((*bytes)[i], (*bytes)[i + 1]);
+  }
+}
+
 void ConvertEndianShort(char* bytes, int64 size) {
   CHECK_EQ(size / 2, 0);
   for (int64 i = 0; i < size; i += 2) {
@@ -1930,16 +1938,14 @@ void Literal::Piece::WriteToProto(LiteralProto* proto) const {
       *proto->mutable_f16s() = string(
           reinterpret_cast<const char*>(data<half>().data()), size_bytes());
       if (!kLittleEndian) {
-        ConvertEndianShort(const_cast<char*>(proto->mutable_f16s()->data()),
-                           proto->f16s().size());
+        ConvertEndianShort(proto->mutable_f16s());
       }
       break;
     case BF16:
       *proto->mutable_bf16s() = string(
           reinterpret_cast<const char*>(data<bfloat16>().data()), size_bytes());
       if (!kLittleEndian) {
-        ConvertEndianShort(const_cast<char*>(proto->mutable_bf16s()->data()),
-                           proto->bf16s().size());
+        ConvertEndianShort(proto->mutable_bf16s());
       }
       break;
     case F32:
-- 
GitLab


From 16f0a5bb2aed8d0e605004b421a9cd6f32e37f94 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Fri, 20 Apr 2018 16:48:44 -0700
Subject: [PATCH 262/450] Java: Bump release to 1.8.0-rc1

PiperOrigin-RevId: 193742798
---
 tensorflow/java/maven/libtensorflow/pom.xml         | 2 +-
 tensorflow/java/maven/libtensorflow_jni/pom.xml     | 2 +-
 tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml | 2 +-
 tensorflow/java/maven/pom.xml                       | 2 +-
 tensorflow/java/maven/proto/pom.xml                 | 2 +-
 tensorflow/java/maven/tensorflow/pom.xml            | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 9c1601753b..66985e3b18 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.8.0-rc0</version>
+    <version>1.8.0-rc1</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 3d013e12b0..34d4ba0b08 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.8.0-rc0</version>
+    <version>1.8.0-rc1</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index 40e44af1f5..1909d08e41 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.8.0-rc0</version>
+    <version>1.8.0-rc1</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 82bfd0c73a..ba98732f5a 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.8.0-rc0</version>
+  <version>1.8.0-rc1</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 0a2775a500..dee8c34359 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.8.0-rc0</version>
+    <version>1.8.0-rc1</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 61961432a7..95e024ace9 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.8.0-rc0</version>
+    <version>1.8.0-rc1</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
-- 
GitLab


From 0385bfe0726ad9710bfcca145e19611e9e2391bb Mon Sep 17 00:00:00 2001
From: Mustafa Ispir <ispir@google.com>
Date: Fri, 20 Apr 2018 17:03:14 -0700
Subject: [PATCH 263/450] Let estimators to be used when eager is enabled.

PiperOrigin-RevId: 193744371
---
 tensorflow/python/estimator/estimator.py      | 283 +++++++++---------
 tensorflow/python/estimator/estimator_test.py |   1 +
 2 files changed, 143 insertions(+), 141 deletions(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 9862fdecdb..351fcb6423 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -100,10 +100,6 @@ class Estimator(object):
   None of `Estimator`'s methods can be overridden in subclasses (its
   constructor enforces this). Subclasses should use `model_fn` to configure
   the base class, and may add methods implementing specialized functionality.
-
-  @compatibility(eager)
-  Estimators are not compatible with eager execution.
-  @end_compatibility
   """
 
   def __init__(self, model_fn, model_dir=None, config=None, params=None,
@@ -166,15 +162,10 @@ class Estimator(object):
                        vocabularies and Tensor names are unchanged.
 
     Raises:
-      RuntimeError: If eager execution is enabled.
       ValueError: parameters of `model_fn` don't match `params`.
       ValueError: if this is called via a subclass and if that class overrides
         a member of `Estimator`.
     """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          'Estimators are not supported when eager execution is enabled.')
-
     Estimator._assert_members_are_not_overridden(self)
 
     if config is None:
@@ -269,7 +260,8 @@ class Estimator(object):
       ValueError: If the Estimator has not produced a checkpoint yet.
     """
     _check_checkpoint_available(self.model_dir)
-    return training.load_variable(self.model_dir, name)
+    with context.graph_mode():
+      return training.load_variable(self.model_dir, name)
 
   def get_variable_names(self):
     """Returns list of all variable names in this model.
@@ -281,7 +273,8 @@ class Estimator(object):
       ValueError: If the Estimator has not produced a checkpoint yet.
     """
     _check_checkpoint_available(self.model_dir)
-    return [name for name, _ in training.list_variables(self.model_dir)]
+    with context.graph_mode():
+      return [name for name, _ in training.list_variables(self.model_dir)]
 
   def latest_checkpoint(self):
     """Finds the filename of latest saved checkpoint file in `model_dir`.
@@ -290,7 +283,8 @@ class Estimator(object):
       The full path to the latest checkpoint or `None` if no checkpoint was
       found.
     """
-    return saver.latest_checkpoint(self.model_dir)
+    with context.graph_mode():
+      return saver.latest_checkpoint(self.model_dir)
 
   def train(self,
             input_fn,
@@ -342,27 +336,28 @@ class Estimator(object):
       ValueError: If both `steps` and `max_steps` are not `None`.
       ValueError: If either `steps` or `max_steps` is <= 0.
     """
-    if (steps is not None) and (max_steps is not None):
-      raise ValueError('Can not provide both steps and max_steps.')
-    if steps is not None and steps <= 0:
-      raise ValueError('Must specify steps > 0, given: {}'.format(steps))
-    if max_steps is not None and max_steps <= 0:
-      raise ValueError(
-          'Must specify max_steps > 0, given: {}'.format(max_steps))
+    with context.graph_mode():
+      if (steps is not None) and (max_steps is not None):
+        raise ValueError('Can not provide both steps and max_steps.')
+      if steps is not None and steps <= 0:
+        raise ValueError('Must specify steps > 0, given: {}'.format(steps))
+      if max_steps is not None and max_steps <= 0:
+        raise ValueError(
+            'Must specify max_steps > 0, given: {}'.format(max_steps))
 
-    if max_steps is not None:
-      start_step = _load_global_step_from_checkpoint_dir(self._model_dir)
-      if max_steps <= start_step:
-        logging.info('Skipping training since max_steps has already saved.')
-        return self
+      if max_steps is not None:
+        start_step = _load_global_step_from_checkpoint_dir(self._model_dir)
+        if max_steps <= start_step:
+          logging.info('Skipping training since max_steps has already saved.')
+          return self
 
-    hooks = _check_hooks_type(hooks)
-    hooks.extend(self._convert_train_steps_to_hooks(steps, max_steps))
+      hooks = _check_hooks_type(hooks)
+      hooks.extend(self._convert_train_steps_to_hooks(steps, max_steps))
 
-    saving_listeners = _check_listeners_type(saving_listeners)
-    loss = self._train_model(input_fn, hooks, saving_listeners)
-    logging.info('Loss for final step: %s.', loss)
-    return self
+      saving_listeners = _check_listeners_type(saving_listeners)
+      loss = self._train_model(input_fn, hooks, saving_listeners)
+      logging.info('Loss for final step: %s.', loss)
+      return self
 
   def _convert_train_steps_to_hooks(self, steps, max_steps):
     if steps is not None or max_steps is not None:
@@ -415,14 +410,15 @@ class Estimator(object):
       ValueError: If no model has been trained, namely `model_dir`, or the
         given `checkpoint_path` is empty.
     """
-    hooks = _check_hooks_type(hooks)
-    hooks.extend(self._convert_eval_steps_to_hooks(steps))
+    with context.graph_mode():
+      hooks = _check_hooks_type(hooks)
+      hooks.extend(self._convert_eval_steps_to_hooks(steps))
 
-    return self._evaluate_model(
-        input_fn=input_fn,
-        hooks=hooks,
-        checkpoint_path=checkpoint_path,
-        name=name)
+      return self._evaluate_model(
+          input_fn=input_fn,
+          hooks=hooks,
+          checkpoint_path=checkpoint_path,
+          name=name)
 
   def _convert_eval_steps_to_hooks(self, steps):
     if steps is None:
@@ -479,45 +475,48 @@ class Estimator(object):
         `predictions`. For example if `predict_keys` is not `None` but
         `EstimatorSpec.predictions` is not a `dict`.
     """
-    hooks = _check_hooks_type(hooks)
-    # Check that model has been trained.
-    if not checkpoint_path:
-      checkpoint_path = saver.latest_checkpoint(self._model_dir)
-    if not checkpoint_path:
-      raise ValueError('Could not find trained model in model_dir: {}.'.format(
-          self._model_dir))
+    with context.graph_mode():
+      hooks = _check_hooks_type(hooks)
+      # Check that model has been trained.
+      if not checkpoint_path:
+        checkpoint_path = saver.latest_checkpoint(self._model_dir)
+      if not checkpoint_path:
+        raise ValueError(
+            'Could not find trained model in model_dir: {}.'.format(
+                self._model_dir))
 
-    with ops.Graph().as_default() as g:
-      random_seed.set_random_seed(self._config.tf_random_seed)
-      self._create_and_assert_global_step(g)
-      features, input_hooks = self._get_features_from_input_fn(
-          input_fn, model_fn_lib.ModeKeys.PREDICT)
-      estimator_spec = self._call_model_fn(
-          features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
-      predictions = self._extract_keys(estimator_spec.predictions, predict_keys)
-      all_hooks = list(input_hooks)
-      all_hooks.extend(hooks)
-      all_hooks.extend(list(estimator_spec.prediction_hooks or []))
-      with training.MonitoredSession(
-          session_creator=training.ChiefSessionCreator(
-              checkpoint_filename_with_path=checkpoint_path,
-              master=self._config.master,
-              scaffold=estimator_spec.scaffold,
-              config=self._session_config),
-          hooks=all_hooks) as mon_sess:
-        while not mon_sess.should_stop():
-          preds_evaluated = mon_sess.run(predictions)
-          if not yield_single_examples:
-            yield preds_evaluated
-          elif not isinstance(predictions, dict):
-            for pred in preds_evaluated:
-              yield pred
-          else:
-            for i in range(self._extract_batch_length(preds_evaluated)):
-              yield {
-                  key: value[i]
-                  for key, value in six.iteritems(preds_evaluated)
-              }
+      with ops.Graph().as_default() as g:
+        random_seed.set_random_seed(self._config.tf_random_seed)
+        self._create_and_assert_global_step(g)
+        features, input_hooks = self._get_features_from_input_fn(
+            input_fn, model_fn_lib.ModeKeys.PREDICT)
+        estimator_spec = self._call_model_fn(
+            features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
+        predictions = self._extract_keys(
+            estimator_spec.predictions, predict_keys)
+        all_hooks = list(input_hooks)
+        all_hooks.extend(hooks)
+        all_hooks.extend(list(estimator_spec.prediction_hooks or []))
+        with training.MonitoredSession(
+            session_creator=training.ChiefSessionCreator(
+                checkpoint_filename_with_path=checkpoint_path,
+                master=self._config.master,
+                scaffold=estimator_spec.scaffold,
+                config=self._session_config),
+            hooks=all_hooks) as mon_sess:
+          while not mon_sess.should_stop():
+            preds_evaluated = mon_sess.run(predictions)
+            if not yield_single_examples:
+              yield preds_evaluated
+            elif not isinstance(predictions, dict):
+              for pred in preds_evaluated:
+                yield pred
+            else:
+              for i in range(self._extract_batch_length(preds_evaluated)):
+                yield {
+                    key: value[i]
+                    for key, value in six.iteritems(preds_evaluated)
+                }
 
   def _assert_members_are_not_overridden(self):
     """Asserts members of `Estimator` are not overridden."""
@@ -597,73 +596,75 @@ class Estimator(object):
           are provided, or no checkpoint can be found.
     """
     # pylint: enable=line-too-long
-    if serving_input_receiver_fn is None:
-      raise ValueError('serving_input_receiver_fn must be defined.')
-
-    with ops.Graph().as_default() as g:
-      self._create_and_assert_global_step(g)
-      random_seed.set_random_seed(self._config.tf_random_seed)
-      serving_input_receiver = serving_input_receiver_fn()
+    with context.graph_mode():
+      if serving_input_receiver_fn is None:
+        raise ValueError('serving_input_receiver_fn must be defined.')
 
-      # Call the model_fn and collect the export_outputs.
-      estimator_spec = self._call_model_fn(
-          features=serving_input_receiver.features,
-          labels=None,
-          mode=model_fn_lib.ModeKeys.PREDICT,
-          config=self.config)
-
-      # Build the SignatureDefs from receivers and all outputs
-      signature_def_map = build_all_signature_defs(
-          serving_input_receiver.receiver_tensors,
-          estimator_spec.export_outputs,
-          serving_input_receiver.receiver_tensors_alternatives)
-
-      if not checkpoint_path:
-        # Locate the latest checkpoint
-        checkpoint_path = saver.latest_checkpoint(self._model_dir)
-      if not checkpoint_path:
-        raise ValueError("Couldn't find trained model at %s." % self._model_dir)
-
-      export_dir = get_timestamped_export_dir(export_dir_base)
-      temp_export_dir = get_temp_export_dir(export_dir)
-
-      # TODO(soergel): Consider whether MonitoredSession makes sense here
-      with tf_session.Session(config=self._session_config) as session:
-
-        saver_for_restore = estimator_spec.scaffold.saver or saver.Saver(
-            sharded=True)
-        saver_for_restore.restore(session, checkpoint_path)
-
-        # pylint: disable=protected-access
-        local_init_op = (
-            estimator_spec.scaffold.local_init_op or
-            monitored_session.Scaffold._default_local_init_op())
-        # pylint: enable=protected-access
-
-        # Perform the export
-        builder = saved_model_builder.SavedModelBuilder(temp_export_dir)
-        builder.add_meta_graph_and_variables(
-            session, [tag_constants.SERVING],
-            signature_def_map=signature_def_map,
-            assets_collection=ops.get_collection(
-                ops.GraphKeys.ASSET_FILEPATHS),
-            legacy_init_op=local_init_op,
-            strip_default_attrs=strip_default_attrs)
-        builder.save(as_text)
-
-      # Add the extra assets
-      if assets_extra:
-        assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir),
-                                         compat.as_bytes('assets.extra'))
-        for dest_relative, source in assets_extra.items():
-          dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
-                                       compat.as_bytes(dest_relative))
-          dest_path = os.path.dirname(dest_absolute)
-          gfile.MakeDirs(dest_path)
-          gfile.Copy(source, dest_absolute)
-
-      gfile.Rename(temp_export_dir, export_dir)
-      return export_dir
+      with ops.Graph().as_default() as g:
+        self._create_and_assert_global_step(g)
+        random_seed.set_random_seed(self._config.tf_random_seed)
+        serving_input_receiver = serving_input_receiver_fn()
+
+        # Call the model_fn and collect the export_outputs.
+        estimator_spec = self._call_model_fn(
+            features=serving_input_receiver.features,
+            labels=None,
+            mode=model_fn_lib.ModeKeys.PREDICT,
+            config=self.config)
+
+        # Build the SignatureDefs from receivers and all outputs
+        signature_def_map = build_all_signature_defs(
+            serving_input_receiver.receiver_tensors,
+            estimator_spec.export_outputs,
+            serving_input_receiver.receiver_tensors_alternatives)
+
+        if not checkpoint_path:
+          # Locate the latest checkpoint
+          checkpoint_path = saver.latest_checkpoint(self._model_dir)
+        if not checkpoint_path:
+          raise ValueError(
+              "Couldn't find trained model at %s." % self._model_dir)
+
+        export_dir = get_timestamped_export_dir(export_dir_base)
+        temp_export_dir = get_temp_export_dir(export_dir)
+
+        # TODO(soergel): Consider whether MonitoredSession makes sense here
+        with tf_session.Session(config=self._session_config) as session:
+
+          saver_for_restore = estimator_spec.scaffold.saver or saver.Saver(
+              sharded=True)
+          saver_for_restore.restore(session, checkpoint_path)
+
+          # pylint: disable=protected-access
+          local_init_op = (
+              estimator_spec.scaffold.local_init_op or
+              monitored_session.Scaffold._default_local_init_op())
+          # pylint: enable=protected-access
+
+          # Perform the export
+          builder = saved_model_builder.SavedModelBuilder(temp_export_dir)
+          builder.add_meta_graph_and_variables(
+              session, [tag_constants.SERVING],
+              signature_def_map=signature_def_map,
+              assets_collection=ops.get_collection(
+                  ops.GraphKeys.ASSET_FILEPATHS),
+              legacy_init_op=local_init_op,
+              strip_default_attrs=strip_default_attrs)
+          builder.save(as_text)
+
+        # Add the extra assets
+        if assets_extra:
+          assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir),
+                                           compat.as_bytes('assets.extra'))
+          for dest_relative, source in assets_extra.items():
+            dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
+                                         compat.as_bytes(dest_relative))
+            dest_path = os.path.dirname(dest_absolute)
+            gfile.MakeDirs(dest_path)
+            gfile.Copy(source, dest_absolute)
+
+        gfile.Rename(temp_export_dir, export_dir)
+        return export_dir
 
   def _get_features_from_input_fn(self, input_fn, mode):
     """Extracts the `features` from return values of `input_fn`."""
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index f4255091bf..d453e19357 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -2287,6 +2287,7 @@ class EstimatorHookOrderingTest(test.TestCase):
 
 class EstimatorIntegrationTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_complete_flow_with_a_simple_linear_model(self):
 
     def _model_fn(features, labels, mode):
-- 
GitLab


From 2591a66ab804b73f55c1c7a0b105744f94d8a02e Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Fri, 20 Apr 2018 17:55:01 -0700
Subject: [PATCH 264/450] Automated g4 rollback of changelist 193717076

PiperOrigin-RevId: 193749007
---
 tensorflow/contrib/tpu/BUILD                  |   1 +
 .../contrib/tpu/python/tpu/keras_support.py   | 391 ++++++++++++++++++
 2 files changed, 392 insertions(+)
 create mode 100644 tensorflow/contrib/tpu/python/tpu/keras_support.py

diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 9646d15486..eac210418b 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -162,6 +162,7 @@ py_library(
         "python/tpu/__init__.py",
         "python/tpu/bfloat16.py",
         "python/tpu/device_assignment.py",
+        "python/tpu/keras_support.py",
         "python/tpu/topology.py",
         "python/tpu/tpu.py",
         "python/tpu/tpu_feed.py",
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
new file mode 100644
index 0000000000..e86ca0a1d8
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -0,0 +1,391 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""*Experimental* support for running Keras models on the TPU.
+
+To use, wrap your model with the `keras_support.tpu_model` function.
+
+Example usage:
+
+```
+# Must activate before building TPU models
+keras_support.setup_tpu_session(master_address)
+
+image = tf.keras.layers.Input(shape=(28, 28, 3), name='image')
+c1 = tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3))(image)
+flattened = tf.keras.layers.Flatten()(c1)
+logits = tf.keras.layers.Dense(10, activation='softmax')(flattened)
+model = tf.keras.Model(inputs=[image], outputs=[logits])
+model = keras_support.tpu_model(model)
+
+# Only TF optimizers are currently supported.
+model.compile(optimizer=tf.train.AdamOptimizer(), ...)
+
+# `images` and `labels` should be Numpy arrays.  Support for tensor input
+# (e.g. datasets) is planned.
+model.fit(images, labels)
+
+# Invoke before shutting down
+keras_support.shutdown_tpu_session()
+```
+"""
+
+# pylint: disable=protected-access
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from tensorflow.contrib.framework.python.framework import experimental
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import tpu
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import layers
+from tensorflow.python.keras._impl.keras import models
+from tensorflow.python.keras._impl.keras import optimizers as keras_optimizers
+from tensorflow.python.keras._impl.keras.layers import embeddings
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import training_util
+
+
+class TPUEmbedding(embeddings.Embedding):
+  """TPU compatible embedding layer.
+
+  The default Keras layer is not TPU compatible.  This layer is a drop-in
+  replacement: it has the same behavior and will work on CPU and GPU devices.
+  """
+
+  def __init__(self, *args, **kw):
+    super(TPUEmbedding, self).__init__(*args, **kw)
+
+  def build(self, input_shape):
+    if input_shape[0] is None:
+      raise ValueError(
+          'TPUEmbeddings must have a fixed input_length or input shape.')
+    return super(TPUEmbedding, self).build(input_shape)
+
+  def call(self, inputs):
+    if K.dtype(inputs) != 'int32':
+      inputs = math_ops.cast(inputs, 'int32')
+
+    inputs = array_ops.one_hot(inputs, self.input_dim)
+    return math_ops.tensordot(inputs, self.embeddings, 1)
+
+
+class CompiledTPUOp(
+    collections.namedtuple(
+        'CompiledTPUOp',
+        ['tpu_execute_op', 'infeed_tensors', 'infeed_op', 'outfeed_op'])):
+  pass
+
+
+def _valid_name(tensor_name):
+  """Return a valid tensor name (strips '/', ':', etc)."""
+  return re.sub('[^a-zA-Z0-9_-]+', '', tensor_name)
+
+
+class TPUFunction(object):
+  """K.function compatible interface for invoking a TPU compiled function.
+
+  Recompilation is triggered on-demand for each new set of input shapes: the
+  results are cached for future execution.  We expect most computations will
+  be dominated by a standard batch-size, followed by a straggler batch for
+  the end of training or evaluation.
+
+  All `inputs` and `outputs` will be loaded via the infeed and outfeed queues
+  instead of being injected as `feed_dict` items or fetches.
+  """
+
+  def __init__(self, model, execution_mode):
+    self.model = model
+    self.execution_mode = execution_mode
+    self._compilation_cache = {}
+
+  def _specialize_model(self, input_specs):
+    """Specialize `self.model` (a Keras model) for the given input shapes."""
+    # Re-create our input and output layers inside our subgraph.  They will be
+    # attached to the true computation when we clone our model in `_model_fn`.
+    K.set_learning_phase(self.execution_mode == model_fn_lib.ModeKeys.TRAIN)
+
+    # functools.partial and callable objects are not supported by tpu.rewrite
+    def _model_fn():
+      """Compute fit/eval/predict for the TPU."""
+      is_training = self.execution_mode == model_fn_lib.ModeKeys.TRAIN
+      is_test = self.execution_mode == model_fn_lib.ModeKeys.EVAL
+      is_predict = self.execution_mode == model_fn_lib.ModeKeys.PREDICT
+
+      # During train/eval, we infeed our features as well as labels.
+      if is_training or is_test:
+        infeed_layers = self.model._input_layers + self.model._output_layers
+      else:
+        infeed_layers = self.model._input_layers
+
+      # Generate our infeed operation to read features & labels.
+      infeed_tensors = tpu_ops.infeed_dequeue_tuple(
+          dtypes=[spec.dtype for spec in input_specs],
+          shapes=[spec.shape for spec in input_specs],
+          name='infeed-%s' % self.execution_mode)
+
+      assert len(infeed_tensors) == len(infeed_layers), (
+          'Infeed inputs did not match model: %s vs %s' % (infeed_layers,
+                                                           infeed_tensors))
+
+      tpu_targets, tpu_inputs = [], []
+
+      # Sort infeed outputs into inputs and labels for calling our Keras model.
+      for tensor, layer in zip(infeed_tensors, infeed_layers):
+        if layer in self.model._input_layers:
+          tpu_inputs.append(layers.Input(name=layer.name, tensor=tensor))
+        if layer in self.model._output_layers:
+          tpu_targets.append(tensor)
+
+      optimizer = self.model.optimizer
+      optimizer.iterations = training_util.get_or_create_global_step()
+
+      # Call our model with our infeed inputs (re-using the weights).
+      model_outputs = self.model(tpu_inputs)
+      child_model = models.Model(inputs=tpu_inputs, outputs=model_outputs)
+      if is_training or is_test:
+        child_model.compile(
+            optimizer=self.model.optimizer,
+            loss=self.model.loss,
+            loss_weights=self.model.loss_weights,
+            metrics=self.model.metrics,
+            weighted_metrics=self.model.weighted_metrics,
+            target_tensors=tpu_targets,
+        )
+
+      # Compute our outfeed depending on the execution mode
+      if is_training:
+        child_model._make_train_function()
+        self._outfeed_spec = [
+            tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
+            for tensor in child_model.train_function.outputs
+        ]
+        return [
+            child_model.train_function.updates_op,
+            tpu_ops.outfeed_enqueue_tuple(
+                child_model.train_function.outputs,
+                name='outfeed-enqueue-train')
+        ]
+      elif is_test:
+        child_model._make_test_function()
+        self._outfeed_spec = [
+            tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
+            for tensor in child_model.test_function.outputs
+        ]
+        return [
+            tpu_ops.outfeed_enqueue_tuple(
+                child_model.test_function.outputs, name='outfeed-enqueue-test')
+        ]
+      elif is_predict:
+        child_model._make_predict_function()
+        self._outfeed_spec = [
+            tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
+            for tensor in child_model.predict_function.outputs
+        ]
+        return [
+            tpu_ops.outfeed_enqueue_tuple(
+                child_model.predict_function.outputs,
+                name='outfeed-enqueue-predict',
+            )
+        ]
+      else:
+        assert False, 'Unexpected execution mode: %s' % self.execution_mode
+
+    # Capture outfeed metadata computed during the rewrite.
+    self._outfeed_spec = None
+
+    tpu_execute_op = tpu.rewrite(_model_fn)
+
+    K._initialize_variables(K.get_session())  # pylint: disable=protected-access
+
+    # Generate CPU side operations to enqueue features/labels and dequeue
+    # outputs from the model call.
+    with ops.device('/device:TPU:0'):
+      infeed_tensors = []
+      for spec in input_specs:
+        infeed_tensors.append(
+            array_ops.placeholder(
+                dtype=spec.dtype,
+                shape=spec.shape,
+                name='infeed-enqueue-%s' % spec.name))
+
+      infeed_op = tpu_ops.infeed_enqueue_tuple(
+          infeed_tensors, [spec.shape for spec in input_specs],
+          name='infeed-enqueue-%s' % self.execution_mode)
+
+      outfeed_op = tpu_ops.outfeed_dequeue_tuple(
+          dtypes=[spec.dtype for spec in self._outfeed_spec],
+          shapes=[spec.shape for spec in self._outfeed_spec],
+          name='outfeed-dequeue-%s' % self.execution_mode)
+
+    return CompiledTPUOp(tpu_execute_op, infeed_tensors, infeed_op, outfeed_op)
+
+  def __call__(self, inputs):
+    assert isinstance(inputs, list)
+
+    # Strip sample weight from inputs
+    if (self.execution_mode == model_fn_lib.ModeKeys.TRAIN or
+        self.execution_mode == model_fn_lib.ModeKeys.EVAL):
+      input_tensors = self.model._feed_inputs + self.model._feed_targets
+      inputs = inputs[:len(input_tensors)]
+    else:
+      input_tensors = self.model._feed_inputs
+
+    # Compute an input specification (used to generate infeed enqueue and
+    # dequeue operations).  We use the shape from our input array and the
+    # dtype from our model.  A user may pass in a float64 for a float32
+    # input: for model compatibility we still must generate a float32 infeed.
+    input_specs = []
+    for tensor, ary in zip(input_tensors, inputs):
+      input_specs.append(
+          tensor_spec.TensorSpec(ary.shape, tensor.dtype,
+                                 _valid_name(tensor.name)))
+
+    # XLA requires every operation in the graph has a fixed shape.  To
+    # handle varying batch sizes we recompile a new sub-graph for each
+    # unique input shape.
+    shape_key = tuple([tuple(spec.shape.as_list()) for spec in input_specs])
+
+    if shape_key not in self._compilation_cache:
+      logging.info('New input shapes; (re-)compiling: mode=%s, %s',
+                   self.execution_mode, input_specs)
+      self._compilation_cache[shape_key] = self._specialize_model(input_specs)
+
+    compiled_model = self._compilation_cache[shape_key]
+
+    infeed_dict = {}
+    for tensor, value in zip(compiled_model.infeed_tensors, inputs):
+      infeed_dict[tensor] = value
+
+    session = K.get_session()
+    _, _, outfeed_outputs = session.run([
+        compiled_model.infeed_op, compiled_model.tpu_execute_op,
+        compiled_model.outfeed_op
+    ], infeed_dict)
+
+    return outfeed_outputs
+
+
+@experimental
+def setup_tpu_session(master):
+  """Initializes and returns a Keras/TF session connected to TPU `master`."""
+  session = tf_session.Session(
+      target=master, config=config_pb2.ConfigProto(isolate_session_state=True))
+  K.set_session(session)
+  K.get_session().run(tpu.initialize_system())
+  K.manual_variable_initialization(True)
+  return session
+
+
+@experimental
+def shutdown_tpu_session(session=None):
+  """Shutdown the TPU attached to session.
+
+  This should be called to cleanly shut down the TPU system before the client
+  exits.
+
+  Args:
+    session: Session to shutdown, or None to use the default session.
+
+  Returns:
+    None.
+  """
+  if session is None:
+    session = K.get_session()
+
+  session.run(tpu.shutdown_system())
+
+
+class KerasTPUModel(models.Model):
+  """TPU compatible Keras model wrapper."""
+
+  def __init__(self, inputs, outputs, name=None):
+    super(models.Model, self).__init__(  # NOTE(review): skips Model.__init__?
+        inputs=inputs,
+        outputs=outputs,
+        name=name,
+    )
+    self.predict_function = None
+    self.test_function = None
+    self.train_function = None
+
+  def compile(self,
+              optimizer,
+              loss=None,
+              metrics=None,
+              loss_weights=None,
+              sample_weight_mode=None,
+              weighted_metrics=None,
+              target_tensors=None,
+              **kwargs):
+    if sample_weight_mode:
+      raise ValueError('sample_weight_mode not supported for TPU execution.')
+    if weighted_metrics:
+      raise ValueError('weighted_metrics not supported for TPU execution.')
+    if target_tensors:
+      raise ValueError('target_tensors is not supported for TPU execution.')
+
+    super(KerasTPUModel, self).compile(optimizer, loss, metrics, loss_weights,
+                                       sample_weight_mode, weighted_metrics,
+                                       target_tensors, **kwargs)
+
+    # Keras optimizers are not compatible with TPU rewrite
+    if not isinstance(self.optimizer, keras_optimizers.TFOptimizer):
+      raise ValueError(
+          'Optimizer must be a TFOptimizer, got: %s' % self.optimizer)
+
+  def train_on_batch(self, x, y, sample_weight=None, class_weight=None):
+    return super(KerasTPUModel, self).train_on_batch(x, y, sample_weight,
+                                                     class_weight)
+
+  def _make_train_function(self):
+    if not self.train_function:
+      self.train_function = TPUFunction(self, model_fn_lib.ModeKeys.TRAIN)
+
+    return self.train_function
+
+  def _make_test_function(self):
+    if not self.test_function:
+      self.test_function = TPUFunction(self, model_fn_lib.ModeKeys.EVAL)
+    return self.test_function
+
+  def _make_predict_function(self):
+    if not self.predict_function:
+      self.predict_function = TPUFunction(self, model_fn_lib.ModeKeys.PREDICT)
+    return self.predict_function
+
+  def cpu_model(self):
+    return models.Model(
+        inputs=self.inputs,
+        outputs=self.outputs,
+        name=self.name,
+    )
+
+
+@experimental
+def tpu_model(model):
+  return KerasTPUModel(
+      inputs=model.inputs, outputs=model.outputs, name=model.name)
-- 
GitLab


From 7cf9b65492121961f98481fa06a0398698c6c0a3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 18:29:01 -0700
Subject: [PATCH 265/450] Automated g4 rollback of changelist 193605910

PiperOrigin-RevId: 193751624
---
 tensorflow/core/grappler/optimizers/BUILD     |   4 -
 .../grappler/optimizers/function_optimizer.cc | 126 +------
 .../grappler/optimizers/function_optimizer.h  |   6 +-
 .../optimizers/function_optimizer_test.cc     |  32 +-
 .../grappler/optimizers/meta_optimizer.cc     | 330 +++++++-----------
 .../core/grappler/optimizers/meta_optimizer.h |  33 +-
 .../optimizers/meta_optimizer_test.cc         | 172 +--------
 tensorflow/core/grappler/utils/functions.cc   |  12 +-
 tensorflow/core/grappler/utils/functions.h    |  40 +--
 .../core/grappler/utils/functions_test.cc     |   8 +-
 10 files changed, 196 insertions(+), 567 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 42c3580d40..3f573cda10 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -517,13 +517,11 @@ cc_library(
         ":loop_optimizer",
         ":memory_optimizer",
         ":model_pruner",
-        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/utils:colocation",
-        "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
@@ -540,11 +538,9 @@ tf_cuda_cc_test(
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
-        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 950933b933..d008a9719f 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -76,10 +75,12 @@ string UniqueSpecializedFunctionName(const FunctionDef& func,
 
 class FunctionOptimizerContext {
  public:
-  explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level,
-                                    const GrapplerItem& item)
-      : function_library_(OpRegistry::Global(), item.graph.library()) {
-    InitializeInlinedFunctions(opt_level, item);
+  explicit FunctionOptimizerContext(const GrapplerItem& item,
+                                    RewriterConfig::Toggle opt_level)
+      : opt_level_(opt_level),
+        function_library_(FunctionLibraryDefinition(OpRegistry::Global(),
+                                                    item.graph.library())) {
+    InitializeInlinedFunctions(item);
   }
 
   const FunctionLibraryDefinition& function_library() const {
@@ -100,9 +101,8 @@ class FunctionOptimizerContext {
   }
 
  private:
-  void InitializeInlinedFunctions(RewriterConfig::Toggle opt_level,
-                                  const GrapplerItem& item) {
-    bool aggressive = opt_level == RewriterConfig::AGGRESSIVE;
+  void InitializeInlinedFunctions(const GrapplerItem& item) {
+    bool aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
 
     for (const FunctionDef& func : item.graph.library().function()) {
       // Can't create IdentityN nodes with no input or output: skip these
@@ -120,6 +120,7 @@ class FunctionOptimizerContext {
     }
   }
 
+  RewriterConfig::Toggle opt_level_;
   FunctionLibraryDefinition function_library_;
   // Functions that can be inlined into optimized graph.
   std::unordered_map<string, const FunctionDef*> inlined_functions_;
@@ -127,93 +128,9 @@ class FunctionOptimizerContext {
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext);
 };
 
-// Return trimmed FunctionDefLibrary with functions that are reachable from
-// the optimized graph.
-FunctionDefLibrary TrimFunctionLibrary(const FunctionLibraryDefinition& flib,
-                                       const GraphDef& optimized_graph) {
-  // Functions that are reachable from the optimized graph.
-  std::unordered_set<string> keep_funcs;
-
-  std::vector<const FunctionDef*> func_queue;
-  func_queue.reserve(flib.num_functions());
-
-  // Add registered and not already processed functions to the queue by name.
-  const auto add_to_func_queue = [&](const string& func_name) {
-    const FunctionDef* func = flib.Find(func_name);
-    if (func && keep_funcs.find(func_name) == keep_funcs.end()) {
-      func_queue.push_back(func);
-    }
-  };
-
-  // Find all the functions that are reachable from the given node.
-  const auto add_node_to_func_queue = [&](const NodeDef& node) {
-    // Node itself can be a call to the function.
-    add_to_func_queue(node.op());
-
-    // Or node can have an attribute referencing a function.
-    for (const auto& attr : node.attr()) {
-      const auto& attr_value = attr.second;
-
-      // 1. AttrValue.func
-      if (attr_value.has_func()) {
-        add_to_func_queue(attr_value.func().name());
-      }
-
-      // 2. AttrValue.ListValue.func
-      if (attr_value.has_list()) {
-        for (const auto& func : attr_value.list().func()) {
-          add_to_func_queue(func.name());
-        }
-      }
-    }
-  };
-
-  // Add all functions that are directly called from the optimized graph.
-  const auto& graph_nodes = optimized_graph.node();
-  std::for_each(graph_nodes.begin(), graph_nodes.end(), add_node_to_func_queue);
-
-  // Process all reachable functions.
-  while (!func_queue.empty()) {
-    const FunctionDef* func = func_queue.back();
-    func_queue.pop_back();
-
-    const string& func_name = func->signature().name();
-    keep_funcs.insert(func_name);
-
-    // Find all the functions that called from the function body.
-    const auto& func_body = func->node_def();
-    std::for_each(func_body.begin(), func_body.end(), add_node_to_func_queue);
-
-    // Check if the function has a registered gradient.
-    const string grad_func_name = flib.FindGradient(func_name);
-    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
-  }
-
-  FunctionDefLibrary lib;
-  for (const string& func_name : keep_funcs) {
-    const FunctionDef* func = CHECK_NOTNULL(flib.Find(func_name));
-    *lib.add_function() = *func;
-
-    const string grad_func_name = flib.FindGradient(func_name);
-    if (!grad_func_name.empty()) {
-      GradientDef* gd = lib.add_gradient();
-      gd->set_function_name(func_name);
-      gd->set_gradient_func(grad_func_name);
-    }
-  }
-
-  VLOG(3) << "Trimmed function library: " << keep_funcs.size() << " functions ("
-          << static_cast<int>(keep_funcs.size() - flib.num_functions()) << ")";
-
-  return lib;
-}
-
 Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
                           FunctionOptimizerContext* ctx,
                           GraphDef* optimized_graph) {
-  VLOG(2) << "Specialize function instantiation: "
-          << SummarizeNodeDef(func_node);
-
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
@@ -224,20 +141,20 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   // TODO(ezhulenev): Push down const inputs and known input shapes.
-  FunctionDef specialized_func;
-  TF_RETURN_IF_ERROR(MakeFunctionDef(item, flib, &specialized_func));
+  FunctionDef specialized;
+  TF_RETURN_IF_ERROR(MakeSpecializedFunctionDef(item, flib, &specialized));
 
   // Find a name for specialized function.
   const string specialized_func_name =
       UniqueSpecializedFunctionName(func, func_node, flib);
 
-  specialized_func.mutable_signature()->set_name(specialized_func_name);
-  auto* specialized_attr = specialized_func.mutable_attr();
+  specialized.mutable_signature()->set_name(specialized_func_name);
+  auto* specialized_attr = specialized.mutable_attr();
   (*specialized_attr)[kGrapplerSpecializedFuncAttr].set_b(true);
 
   // Add specialized function to the library.
   TF_RETURN_IF_ERROR(
-      ctx->mutable_function_library().AddFunctionDef(specialized_func));
+      ctx->mutable_function_library().AddFunctionDef(specialized));
 
   // Add a function call node for the specialized function.
   NodeDef* specialized_func_node = optimized_graph->add_node();
@@ -309,8 +226,6 @@ Status HookInlinedFunctionOutputs(
 Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
                       const FunctionOptimizerContext& ctx,
                       GraphDef* optimized_graph) {
-  VLOG(2) << "Inline function instantiation: " << SummarizeNodeDef(func_node);
-
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
@@ -444,8 +359,6 @@ class SymbolicGradientEnv {
 
 Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
                               GraphDef* inlined_graph) {
-  VLOG(2) << "Inline symbolic gradient: " << SummarizeNodeDef(node);
-
   GraphDef graph_def;
 
   // Create a node to anchor the gradient inputs
@@ -541,16 +454,13 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
 
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
-  VLOG(2) << "Optimize function library: id=" << item.id;
-
   // Nothing to do here.
   if (item.graph.library().function_size() == 0) {
-    VLOG(3) << "Skip Grappler item with empty function library";
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
-  FunctionOptimizerContext ctx(opt_level_, item);
+  FunctionOptimizerContext ctx(item, opt_level_);
   SymbolicGradientEnv env(item.graph.versions().producer(),
                           item.graph.library());
 
@@ -596,11 +506,9 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     *optimized_graph->add_node() = node;
   }
 
+  // TODO(bsteiner): trim the library to remove unused function definitions
   *optimized_graph->mutable_versions() = item.graph.versions();
-  *optimized_graph->mutable_library() =
-      options_.enable_trim_function_library
-          ? TrimFunctionLibrary(ctx.function_library(), *optimized_graph)
-          : ctx.function_library().ToProto();
+  *optimized_graph->mutable_library() = ctx.function_library().ToProto();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index e307b4e533..c555fadf83 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -26,9 +26,8 @@ namespace grappler {
 // operations to make the overall graph more efficient.
 class FunctionOptimizer : public GraphOptimizer {
  public:
-  explicit FunctionOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
-  ~FunctionOptimizer() override = default;
+  FunctionOptimizer(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {}
+  ~FunctionOptimizer() override {}
 
   string name() const override { return "function_optimizer"; };
 
@@ -45,7 +44,6 @@ class FunctionOptimizer : public GraphOptimizer {
     bool enable_function_inlining = true;
     bool enable_function_specialization = true;
     bool enable_symbolic_gradient_inlining = true;
-    bool enable_trim_function_library = true;
   };
 
   RewriterConfig::Toggle opt_level_;
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index 6147e8a27c..fb006d4868 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -31,8 +31,20 @@ constexpr char kDevice[] = "/device:CPU:0";
 
 class FunctionOptimizerTest : public GrapplerTest {
  protected:
-  void DisableFunctionSpecialization(FunctionOptimizer* optimizer) {
+  void DisableAll(FunctionOptimizer* optimizer) {
+    optimizer->options_.enable_function_inlining = false;
     optimizer->options_.enable_function_specialization = false;
+    optimizer->options_.enable_symbolic_gradient_inlining = false;
+  }
+
+  void EnableOnlyFunctionInlining(FunctionOptimizer* optimizer) {
+    DisableAll(optimizer);
+    optimizer->options_.enable_function_inlining = true;
+  }
+
+  void EnableOnlyFunctionSpecialization(FunctionOptimizer* optimizer) {
+    DisableAll(optimizer);
+    optimizer->options_.enable_function_specialization = true;
   }
 };
 
@@ -340,7 +352,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithoutInput) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-  DisableFunctionSpecialization(&optimizer);  // do not specialize noinline func
+  EnableOnlyFunctionInlining(&optimizer);
 
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
@@ -614,13 +626,14 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+  EnableOnlyFunctionSpecialization(&optimizer);
 
-  // Mark XTimesTwo as noinline.
+  // Mark XTimesTwo as noinline
   FunctionDef x_times_two = test::function::XTimesTwo();
   (*x_times_two.mutable_attr())["_noinline"].set_b(true);
   std::vector<FunctionDef> function_library = {x_times_two};
 
-  // Build a graph to compute y = XTimesTwo(x).
+  // Build a graph to compute y = XTimesTwo(x)
   GrapplerItem item;
   item.graph = test::function::GDef(
       {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
@@ -631,13 +644,12 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  // Make sure that specialized function was added to the library and original
-  // function was removed.
-  EXPECT_EQ(1, output.library().function_size());
+  // Make sure that specialized function was added to the library
+  EXPECT_EQ(2, output.library().function_size());
   EXPECT_EQ("XTimesTwo_specialized_for_y",
-            output.library().function(0).signature().name());
+            output.library().function(1).signature().name());
 
-  // And 'y' node is calling specialized function.
+  // And 'y' node is calling specialized function
   int count = 0;
   for (const NodeDef& node : output.node()) {
     if (node.name() == "y" && count++) {
@@ -646,7 +658,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   }
   EXPECT_EQ(1, count);
 
-  // And that graph evaluation yields the same result.
+  // And that graph evaluation yields the same result
   Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index cdc4698c34..558b8a77e8 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
-#include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
@@ -30,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils/colocation.h"
-#include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -38,9 +36,6 @@ namespace tensorflow {
 namespace grappler {
 
 namespace {
-
-constexpr int kDefaultNumberOfIterations = 1;
-
 int64 NumEdges(const GraphDef& graph) {
   int64 num_edges = 0;
   for (const auto& node : graph.node()) {
@@ -55,138 +50,144 @@ string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) {
                          NumEdges(after), " edges (",
                          NumEdges(after) - NumEdges(before), ")");
 }
-
-int NumIterations(const RewriterConfig& cfg) {
-  return cfg.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
-             ? kDefaultNumberOfIterations
-             : cfg.meta_optimizer_iterations();
-}
-
-// Check if optimizer is allowed to run only once.
-int IsRunOnceOptimizer(const string& name) { return name == "layout"; }
-
 }  // namespace
 
-std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
-    const string& optimizer) const {
-#define MK_OPT(NAME, VALUE) \
-  if (optimizer == NAME) return std::unique_ptr<GraphOptimizer>(VALUE)
-
-  MK_OPT("pruning", new ModelPruner());
-  MK_OPT("function", new FunctionOptimizer(cfg_.function_optimization()));
-  MK_OPT("constfold", new ConstantFolding(cpu_device_));
-  MK_OPT("layout", new LayoutOptimizer());
-  MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL));
-  MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
-  MK_OPT("autoparallel", new AutoParallel(cfg_.auto_parallel().num_replicas()));
-  MK_OPT("loop", new LoopOptimizer(cfg_.loop_optimization()));
-  MK_OPT("dependency", new DependencyOptimizer(cfg_.dependency_optimization()));
-  MK_OPT("debug_stripper", new DebugStripper());
-
-  return std::unique_ptr<GraphOptimizer>();
-#undef MK_OPT
-}
-
-Status MetaOptimizer::InitializeOptimizers(
-    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
-  if (!cfg_.disable_model_pruning()) {
-    optimizers->emplace_back(new ModelPruner());
+std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
+    const string& optimizer) {
+  std::unique_ptr<GraphOptimizer> graph_optimizer;
+  if (optimizer == "pruning") {
+    graph_optimizer.reset(new ModelPruner());
   }
-  if (cfg_.function_optimization() != RewriterConfig::OFF) {
-    optimizers->emplace_back(
-        new FunctionOptimizer(cfg_.function_optimization()));
+  if (optimizer == "function") {
+    graph_optimizer.reset(new FunctionOptimizer(cfg_.function_optimization()));
   }
-  if (cfg_.debug_stripper() == RewriterConfig::ON) {
-    optimizers->emplace_back(new DebugStripper());
+  if (optimizer == "constfold") {
+    graph_optimizer.reset(new ConstantFolding(cpu_device_));
   }
-  if (cfg_.constant_folding() != RewriterConfig::OFF) {
-    optimizers->emplace_back(
-        new ConstantFolding(cfg_.constant_folding(), cpu_device_));
+  if (optimizer == "layout") {
+    graph_optimizer.reset(new LayoutOptimizer());
   }
-  if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
-    optimizers->emplace_back(
-        new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
+  if (optimizer == "memory") {
+    graph_optimizer.reset(new MemoryOptimizer(RewriterConfig::MANUAL));
   }
-  if (cfg_.loop_optimization() != RewriterConfig::OFF) {
-    optimizers->emplace_back(new LoopOptimizer(cfg_.loop_optimization()));
+  if (optimizer == "arithmetic") {
+    graph_optimizer.reset(
+        new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
   }
-  if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
-    optimizers->emplace_back(
-        new DependencyOptimizer(cfg_.dependency_optimization()));
+  if (optimizer == "autoparallel") {
+    graph_optimizer.reset(
+        new AutoParallel(cfg_.auto_parallel().num_replicas()));
   }
-  if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
-    optimizers->emplace_back(new LayoutOptimizer());
+  if (optimizer == "loop") {
+    graph_optimizer.reset(new LoopOptimizer(cfg_.loop_optimization()));
   }
-  if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
-    if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
-      optimizers->emplace_back(
-          // Use the default target node name prefix "gradients/"
-          new MemoryOptimizer(cfg_.memory_optimization()));
-    } else {
-      optimizers->emplace_back(
-          new MemoryOptimizer(cfg_.memory_optimization(),
-                              cfg_.memory_optimizer_target_node_name_scope()));
-    }
+  if (optimizer == "dependency") {
+    graph_optimizer.reset(
+        new DependencyOptimizer(cfg_.dependency_optimization()));
   }
-  if (cfg_.auto_parallel().enable()) {
-    optimizers->emplace_back(
-        new AutoParallel(cfg_.auto_parallel().num_replicas()));
+  if (optimizer == "debug_stripper") {
+    graph_optimizer.reset(new DebugStripper());
   }
-  return Status::OK();
+  return graph_optimizer;
 }
 
-Status MetaOptimizer::InitializeOptimizersByName(
-    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
-  for (const string& optimizer_name : cfg_.optimizers()) {
-    auto optimizer = MakeNewOptimizer(optimizer_name);
-    if (optimizer) {
-      VLOG(2) << "Registered default graph optimizer: " << optimizer_name;
-      optimizers->push_back(std::move(optimizer));
-      continue;
+Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* optimized_graph) {
+  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
+  if (cfg_.optimizers().empty()) {
+    if (!cfg_.disable_model_pruning()) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(new ModelPruner()));
     }
-
-    auto custom_optimizer =
-        CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
-
-    if (custom_optimizer) {
-      VLOG(2) << "Registered custom graph optimizer: " << optimizer_name;
-      TF_RETURN_IF_ERROR(custom_optimizer->Init());
-      optimizers->push_back(std::move(custom_optimizer));
-    } else {
-      VLOG(2) << "Can't register an optimizer by name: " << optimizer_name;
+    if (cfg_.function_optimization() != RewriterConfig::OFF) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new FunctionOptimizer(cfg_.function_optimization())));
+    }
+    if (cfg_.debug_stripper() == RewriterConfig::ON) {
+      optimizers.push_back(
+          std::unique_ptr<GraphOptimizer>(new DebugStripper()));
+    }
+    if (cfg_.constant_folding() != RewriterConfig::OFF) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new ConstantFolding(cfg_.constant_folding(), cpu_device_)));
+    }
+    if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new ArithmeticOptimizer(cfg_.arithmetic_optimization())));
+    }
+    if (cfg_.loop_optimization() != RewriterConfig::OFF) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new LoopOptimizer(cfg_.loop_optimization())));
+    }
+    if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new DependencyOptimizer(cfg_.dependency_optimization())));
+    }
+    if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
+      optimizers.push_back(
+          std::unique_ptr<GraphOptimizer>(new LayoutOptimizer()));
+    }
+    if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
+      if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
+        optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+            // Use the default target node name prefix "gradients/"
+            new MemoryOptimizer(cfg_.memory_optimization())));
+      } else {
+        optimizers.push_back(
+            std::unique_ptr<GraphOptimizer>(new MemoryOptimizer(
+                cfg_.memory_optimization(),
+                cfg_.memory_optimizer_target_node_name_scope())));
+      }
+    }
+    if (cfg_.auto_parallel().enable()) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new AutoParallel(cfg_.auto_parallel().num_replicas())));
+    }
+  } else {
+    const std::set<string> available_optimizers = {
+        "pruning",    "function",      "constfold",  "layout",
+        "memory",     "autoparallel",  "arithmetic", "loop",
+        "dependency", "debug_stripper"};
+    std::vector<string> custom_optimizer_names;
+    for (const auto& optimizer_name : cfg_.optimizers()) {
+      if (available_optimizers.find(optimizer_name) !=
+          available_optimizers.end()) {
+        optimizers.push_back(NewOptimizer(optimizer_name));
+      } else {
+        custom_optimizer_names.push_back(optimizer_name);
+      }
+    }
+    // Now create and initialize the custom optimizers.
+    for (const auto& optimizer_name : custom_optimizer_names) {
+      std::unique_ptr<CustomGraphOptimizer> opt =
+          CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
+      if (opt == nullptr) continue;
+      TF_RETURN_IF_ERROR(opt->Init());
+      optimizers.push_back(std::move(opt));
     }
   }
-  return Status::OK();
-}
-
-Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
-                                    GraphDef* optimized_graph) {
-  VLOG(2) << "Optimize GrapplerItem: item.id=" << item.id;
-
-  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
-  bool register_by_name = !cfg_.optimizers().empty();
-  TF_RETURN_IF_ERROR(register_by_name ? InitializeOptimizersByName(&optimizers)
-                                      : InitializeOptimizers(&optimizers));
 
   if (optimizers.empty()) {
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
-  // Invariant: optimized_graph contains the most recently optimized version of
-  // the graph.
+  // Some optimizers should be run only once.
+  const std::set<string> run_once_optimizers = {"layout"};
+  bool already_optimized = false;
+  const int num_iterations =
+      cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
+          ? 1
+          : cfg_.meta_optimizer_iterations();
   GrapplerItem optimized_item = item;
   optimized_graph->Swap(&optimized_item.graph);
-
-  GraphOptimizationResult optimization_result(item.id);
-
-  for (int iteration = 0; iteration < NumIterations(cfg_); ++iteration) {
-    VLOG(4) << "Starting optimization iteration " << iteration + 1;
-
+  for (int iteration = 0; iteration < num_iterations; ++iteration) {
+    VLOG(1) << "Starting optimization iteration " << iteration + 1;
     for (const auto& optimizer : optimizers) {
-      // Some optimizers can run only once.
-      if (iteration > 0 && IsRunOnceOptimizer(optimizer->name())) continue;
-
+      // Invariant: optimized_graph contains the most recently optimized
+      // version of the graph.
+      if (iteration > 0 && run_once_optimizers.count(optimizer->name())) {
+        continue;
+      }
       uint64 start_us = Env::Default()->NowMicros();
       // This swaps the current optimized_graph into optimized item and
       // resets optimized_graph to an empty graph.
@@ -194,118 +195,45 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
       *optimized_graph = GraphDef();
       Status status =
           optimizer->Optimize(cluster, optimized_item, optimized_graph);
-      uint64 end_us = Env::Default()->NowMicros();
 
+      uint64 end_us = Env::Default()->NowMicros();
+      float duration_ms = (end_us - start_us) / 1000.0f;
       string result;
       if (!status.ok()) {
+        VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
+                << status.ToString();
         optimized_graph->Swap(&optimized_item.graph);
         result = status.ToString();
       } else {
-        optimization_result.is_optimized = true;
-        float duration_ms = (end_us - start_us) / 1000.0f;
+        already_optimized = true;
         result = strings::StrCat(
+            optimizer->name(), ": ",
             PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph),
             ", time = ", duration_ms, "ms.");
       }
-      VLOG(4) << optimizer->name() << ": " << result;
-
-      OptimizerResult optimizer_result{optimizer->name(), result};
-      optimization_result.results.push_back(optimizer_result);
+      result_.emplace_back(optimizer->name(), result);
+      VLOG(1) << result;
     }
   }
 
-  // Record graph optimization result.
-  optimization_results_.push_back(optimization_result);
-
-  if (optimization_result.is_optimized) {
+  if (already_optimized) {
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
     ReassignColocation(optimized_graph);
-    // Make sure that the optimizers preserved the graph version.
+    // Make sure that the optimizers preserved the graph version and library.
+    DCHECK_GE(optimized_graph->library().function_size(),
+              item.graph.library().function_size());
+    DCHECK_GE(optimized_graph->library().gradient_size(),
+              item.graph.library().gradient_size());
     DCHECK_EQ(optimized_graph->versions().producer(),
               item.graph.versions().producer());
   }
-
-  return Status::OK();
-}
-
-Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* optimized_graph) {
-  optimization_results_.clear();
-
-  // 1. Optimize main graph
-  TF_RETURN_IF_ERROR(OptimizeGraph(cluster, item, optimized_graph));
-
-  // 2. Optimize function library
-  FunctionLibraryDefinition flib(OpRegistry::Global(),
-                                 optimized_graph->library());
-
-  // Optimize each function only once.
-  std::unordered_set<string> optimized_funcs;
-  bool optimize_function_library = true;
-
-  // TODO(ezhulenev): turn it on after fixing ranklab: tune_tf_test.
-  cfg_.set_constant_folding(RewriterConfig::OFF);
-  cfg_.set_arithmetic_optimization(RewriterConfig::OFF);
-
-  while (optimize_function_library) {
-    optimize_function_library = false;
-
-    for (const FunctionDef& func : optimized_graph->library().function()) {
-      const string& func_name = func.signature().name();
-
-      // Skip already optimized functions.
-      if (optimized_funcs.find(func_name) != optimized_funcs.end()) continue;
-
-      // Skip parametrized functions (function type or body is defined only at
-      // function call time by caller node attributes).
-      if (IsParametrized(func)) continue;
-
-      VLOG(3) << "Optimize function: function=" << func_name;
-
-      // Function optimization might specialize nested function calls, so we
-      // have to reset the flag and do at least one more pass over the library.
-      optimize_function_library = true;
-      optimized_funcs.insert(func_name);
-
-      // Make a GrapplerItem from a FunctionDef.
-      GrapplerFunctionItem func_item;
-      TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, flib, &func_item));
-
-      // Optimize function body graph.
-      GraphDef optimized_func_graph;
-      TF_RETURN_IF_ERROR(
-          OptimizeGraph(cluster, func_item, &optimized_func_graph));
-
-      // Function body optimization might have created new specialized
-      // functions, add them to the library.
-      TF_RETURN_IF_ERROR(flib.AddLibrary(optimized_func_graph.library()));
-
-      // Convert optimized graph back to FunctionDef.
-      FunctionDef optimized_func;
-      func_item.SwapFunctionBody(std::move(optimized_func_graph));
-      TF_RETURN_IF_ERROR(MakeFunctionDef(func_item, flib, &optimized_func));
-
-      // Replace optimized function with a new FunctionDef.
-      TF_RETURN_IF_ERROR(flib.RemoveFunction(func_name));
-      TF_RETURN_IF_ERROR(flib.AddFunctionDef(optimized_func));
-    }
-
-    // If optimized at least one function, update the graph library.
-    if (optimize_function_library) {
-      *optimized_graph->mutable_library() = flib.ToProto();
-    }
-  }
-
   return Status::OK();
 }
 
 void MetaOptimizer::PrintResult() {
-  for (const GraphOptimizationResult& graph_result : optimization_results_) {
-    LOG(INFO) << "Optimization results for grappler item: " << graph_result.id;
-    for (const OptimizerResult& result : graph_result.results) {
-      LOG(INFO) << "Return status of optimizer " << result.optimizer_name
-                << ": " << result.result;
-    }
+  for (const auto& result : result_) {
+    LOG(INFO) << "Return status of optimizer " << result.first << ": "
+              << result.second;
   }
 }
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index 7cf9a40c2d..382cfe51d4 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -30,7 +30,7 @@ class MetaOptimizer : public GraphOptimizer {
  public:
   MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg)
       : cpu_device_(cpu_device), cfg_(cfg) {}
-  ~MetaOptimizer() override = default;
+  ~MetaOptimizer() override {}
 
   string name() const override { return "meta_optimizer"; };
 
@@ -43,37 +43,10 @@ class MetaOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
-  std::unique_ptr<GraphOptimizer> MakeNewOptimizer(
-      const string& optimizer) const;
-
-  // Initialize active optimizers from RewriterConfig toggles.
-  Status InitializeOptimizers(
-      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
-  // Initialize active optimizers from RewriterConfig optimizer names.
-  Status InitializeOptimizersByName(
-      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
-
-  // Run optimization pass over a single GrapplerItem. Meta optimizer might run
-  // multiple such passes: 1) for the main graph 2) for the function library
-  Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
-                       GraphDef* optimized_graph);
-
+  std::unique_ptr<GraphOptimizer> NewOptimizer(const string& optimizer);
   DeviceBase* const cpu_device_;  // may be NULL
   RewriterConfig cfg_;
-
-  struct OptimizerResult {
-    string optimizer_name;
-    string result;
-  };
-
-  struct GraphOptimizationResult {
-    explicit GraphOptimizationResult(const string& id) : id(id) {}
-    string id;
-    bool is_optimized = false;
-    std::vector<OptimizerResult> results;
-  };
-
-  std::vector<GraphOptimizationResult> optimization_results_;
+  std::vector<std::pair<string, string>> result_;
 };
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 8793ad9633..d9a386b9be 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -16,14 +16,11 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/framework/function_testlib.h"
-#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -31,8 +28,6 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-constexpr char kDevice[] = "/device:CPU:0";
-
 class TestOptimizer : public CustomGraphOptimizer {
  public:
   static void SetOptimized(const bool flag_value) { optimized_ = flag_value; }
@@ -61,9 +56,7 @@ bool TestOptimizer::optimized_;
 
 REGISTER_GRAPH_OPTIMIZER(TestOptimizer);
 
-class MetaOptimizerTest : public GrapplerTest {};
-
-TEST_F(MetaOptimizerTest, RunsCustomOptimizer) {
+TEST(MetaOptimizerTest, RunsCustomOptimizer) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
@@ -79,7 +72,7 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizer) {
   EXPECT_TRUE(TestOptimizer::IsOptimized());
 }
 
-TEST_F(MetaOptimizerTest, RunOptimizersTwice) {
+TEST(MetaOptimizerTest, RunOptimizersTwice) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
@@ -93,167 +86,6 @@ TEST_F(MetaOptimizerTest, RunOptimizersTwice) {
   TF_EXPECT_OK(status);
 }
 
-TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
-  using test::function::NDef;
-
-  // Enable ony function optimization.
-  RewriterConfig rewriter_config;
-  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
-  rewriter_config.set_function_optimization(RewriterConfig::ON);
-  rewriter_config.add_optimizers("function");
-
-  MetaOptimizer optimizer(nullptr, rewriter_config);
-
-  // Define function library:
-  //
-  //   MyMul(x, y)    = x * y
-  //  *MySquare(x)    = MyMul(x, x)
-  //  *MyQuadratic(x) = MySquare(MySquare(x))
-  //
-  //  * - marked as noinline
-
-  FunctionDef mul_func = FunctionDefHelper::Create(
-      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
-      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
-      {{"z", "mul:z:0"}});
-
-  FunctionDef square_func = FunctionDefHelper::Create(
-      "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
-      {{{"my_mul"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
-      {{"z", "my_mul:z:0"}});
-  (*square_func.mutable_attr())["_noinline"].set_b(true);
-
-  FunctionDef quadratic_func = FunctionDefHelper::Create(
-      "MyQuadratic", {"x:T"}, {"z:T"}, {"T: {float, double}"},
-      {{{"square"}, "MySquare", {"x"}, {{"T", "$T"}}},
-       {{"quadratic"}, "MySquare", {"square:z"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
-      {{"z", "quadratic:z:0"}});
-  (*quadratic_func.mutable_attr())["_noinline"].set_b(true);
-
-  // Tensorflow graph:
-  //
-  //   a = tf.Placeholder(tf.float);
-  //   b = tf.Placeholder(tf.int32);
-  //
-  //   square = MySquare(a);        // a^2
-  //   quadratic = MyQuadratic(b);  // b^4
-  GrapplerItem item;
-  item.graph = test::function::GDef(
-      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
-       NDef("b", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice),
-       // Calls into function library
-       NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("quadratic", "MyQuadratic", {"b"}, {{"T", DT_INT32}}, kDevice),
-       // Forward outputs
-       NDef("out_s", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("out_q", "Identity", {"quadratic:0"}, {{"T", DT_INT32}}, kDevice)},
-      // FunctionLib
-      {mul_func, square_func, quadratic_func});
-
-  GraphDef output;
-  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
-
-  FunctionLibraryDefinition optimized_flib(OpRegistry::Global(),
-                                           output.library());
-
-  // Specialized and optimized functions should be added to the graph.
-  EXPECT_EQ(6, optimized_flib.num_functions());
-
-  // MyQuadratic should be specialized once:
-  //   0. 'quadratic' node in the main graph
-  const string optimized_0 = "MyQuadratic_specialized_for_quadratic";
-
-  // MySquare should be specialized and optimized for 3 instantiations:
-  //   1. 'square' node in the main graph
-  //   2. 'square' node in the MyQuadratic specialization
-  //   3. 'quadratic' node in the MyQuadratic specialization
-
-  const string optimized_1 = "MySquare_specialized_for_square";
-  const string optimized_2 = "MySquare_specialized_for_square_1";
-  const string optimized_3 = "MySquare_specialized_for_quadratic";
-
-  const FunctionDef* optimized_func_0 = optimized_flib.Find(optimized_0);
-  const FunctionDef* optimized_func_1 = optimized_flib.Find(optimized_1);
-  const FunctionDef* optimized_func_2 = optimized_flib.Find(optimized_2);
-  const FunctionDef* optimized_func_3 = optimized_flib.Find(optimized_3);
-
-  ASSERT_NE(optimized_func_0, nullptr);
-  ASSERT_NE(optimized_func_1, nullptr);
-  ASSERT_NE(optimized_func_2, nullptr);
-  ASSERT_NE(optimized_func_3, nullptr);
-
-  // Graph should call optimized function.
-  int count = 0;
-  for (const NodeDef& node : output.node()) {
-    if (node.name() == "square" && count++) {
-      EXPECT_EQ("MySquare_specialized_for_square", node.op());
-    } else if (node.name() == "quadratic" && count++) {
-      EXPECT_EQ("MyQuadratic_specialized_for_quadratic", node.op());
-    }
-  }
-  EXPECT_EQ(2, count);
-
-  // Specialized MySquare should call specialized functions.
-  count = 0;
-  for (const NodeDef& node : optimized_func_0->node_def()) {
-    if (node.name() == "square" && count++) {
-      EXPECT_EQ(optimized_2, node.op());
-    } else if (node.name() == "quadratic" && count++) {
-      EXPECT_EQ(optimized_3, node.op());
-    }
-  }
-  EXPECT_EQ(2, count);
-
-  const std::vector<const FunctionDef*> optimized_funcs = {
-      optimized_func_1, optimized_func_1, optimized_func_3};
-
-  // MyMul should be inlined into all optimized versions of MySquare.
-  for (const FunctionDef* optimized_func : optimized_funcs) {
-    count = 0;
-    for (const NodeDef& node : optimized_func->node_def()) {
-      if (node.name() == "my_mul/inlined_inputs" && count++) {
-        EXPECT_EQ("IdentityN", node.op());
-        EXPECT_EQ(2, node.input_size());
-        EXPECT_EQ("x:0", node.input(0));
-        EXPECT_EQ("x:0", node.input(1));
-      } else if (node.name() == "my_mul/x" && count++) {
-        EXPECT_EQ("Identity", node.op());
-        EXPECT_EQ(1, node.input_size());
-        EXPECT_EQ("my_mul/inlined_inputs:output:0", node.input(0));
-      } else if (node.name() == "my_mul/y" && count++) {
-        EXPECT_EQ("Identity", node.op());
-        EXPECT_EQ(1, node.input_size());
-        EXPECT_EQ("my_mul/inlined_inputs:output:1", node.input(0));
-      } else if (node.name() == "my_mul/mul" && count++) {
-        EXPECT_EQ("Mul", node.op());
-        EXPECT_EQ(2, node.input_size());
-        EXPECT_EQ("my_mul/x:output:0", node.input(0));
-        EXPECT_EQ("my_mul/y:output:0", node.input(1));
-      } else if (node.name() == "my_mul" && count++) {
-        EXPECT_EQ("IdentityN", node.op());
-        EXPECT_EQ(1, node.input_size());
-        EXPECT_EQ("my_mul/mul:z:0", node.input(0));
-      }
-      EXPECT_TRUE(node.device().empty());
-    }
-    EXPECT_EQ(5, count);
-  }
-
-  item.fetch = {"out_s", "out_q"};
-  item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
-  item.feed.emplace_back("b", test::AsScalar<int>(4));
-  auto tensors_expected = EvaluateFetchNodes(item);
-
-  GrapplerItem optimized(item, std::move(output));
-  auto tensors = EvaluateFetchNodes(optimized);
-
-  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
-  test::ExpectTensorEqual<int>(tensors_expected[1], tensors[1]);
-}
-
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 790809bc67..638fe1999a 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -545,12 +545,6 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
   return Status::OK();
 }
 
-Status MakeGrapplerFunctionItem(const FunctionDef& func,
-                                const FunctionLibraryDefinition& flib,
-                                GrapplerFunctionItem* item) {
-  return MakeGrapplerFunctionItem(func, AttrValueMap(), flib, item);
-}
-
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity.
 Status RegisterGrapplerFunctionConnectivity(
@@ -566,9 +560,9 @@ Status RegisterGrapplerFunctionConnectivity(
   return Status::OK();
 }
 
-Status MakeFunctionDef(const GrapplerFunctionItem& item,
-                       const FunctionLibraryDefinition& flib,
-                       FunctionDef* func) {
+Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
+                                  const FunctionLibraryDefinition& flib,
+                                  FunctionDef* func) {
   func->mutable_signature()->set_name(item.id);
   func->mutable_signature()->set_is_stateful(item.is_stateful());
 
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 5e8b6c6960..ab369bcad7 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -38,8 +38,7 @@ using AttrValueMap = std::unordered_map<string, AttrValue>;
 // function body in place of function inputs and a resolved input data type.
 struct InputArgExpansion {
   // TODO(ezhulenev): Add support for functions with tensor sequence inputs of
-  // different data types.
-  // TODO(ezhulenev): Support type parametrized inputs?
+  // different data types
   string input_name;                 // name of the function input argument
   DataType data_type;                // input data type
   bool is_ref;                       // if true, inputs are required to be refs
@@ -54,8 +53,7 @@ struct InputArgExpansion {
 // tensors of a function body nodes and a resolved output data type
 struct OutputArgExpansion {
   // TODO(ezhulenev): Add support for functions with tensor sequence outputs of
-  // different data types.
-  // TODO(ezhulenev): Support type parametrized outputs?
+  // different data types
   string output_name;                  // name of the function output argument
   DataType data_type;                  // output data type
   bool is_ref;                         // if true, outputs are refs
@@ -188,6 +186,13 @@ bool HasParametrizedBody(const FunctionDef& func);
 // Check if function has parametrized type or body.
 bool IsParametrized(const FunctionDef& func);
 
+// Make a GrapplerFunctionItem from the function definition and attributes.
+// Return error if the given function def cannot be converted.
+Status MakeGrapplerFunctionItem(
+    const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
+    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
+
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity.  Use function library definition to
 // lookup function body nodes output names and ranges.
@@ -195,28 +200,11 @@ Status RegisterGrapplerFunctionConnectivity(
     const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
     GrapplerFunctionConnectivity* connectivity);
 
-// Make a GrapplerFunctionItem from the function definition and function
-// instantiation attributes (caller node attributes). Returns error if the given
-// function def cannot be converted (e.g. not all attributes are defined).
-Status MakeGrapplerFunctionItem(
-    const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
-    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
-
-// Make a GrapplerFunction item from the function definition. Function must be
-// fully defined (no type or body parametrization).
-// TODO(ezhulenev): Support parametrized functions without fully defined
-// instantiation attributes? Do we ever want to optimize parametrized function
-// without specializing it to it's instantiation attributes (at least types)?
-Status MakeGrapplerFunctionItem(const FunctionDef& func,
-                                const FunctionLibraryDefinition& flib,
-                                GrapplerFunctionItem* item);
-
-// Make a FunctionDef from the GrapplerFunctionItem. Use function library
-// definition to lookup function body nodes output names and ranges.
-Status MakeFunctionDef(const GrapplerFunctionItem& item,
-                       const FunctionLibraryDefinition& flib,
-                       FunctionDef* func);
+// Make a specialized FunctionDef from the GrapplerFunctionItem. Use function
+// library definition to lookup function body nodes output names and ranges.
+Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
+                                  const FunctionLibraryDefinition& flib,
+                                  FunctionDef* func);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 6dfd49b943..54d235a8a4 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -524,7 +524,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
   EXPECT_EQ("two", cast.input(0));
 }
 
-TEST_F(FunctionsTest, MakeFunctionDef) {
+TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
       // Name
@@ -550,7 +550,7 @@ TEST_F(FunctionsTest, MakeFunctionDef) {
   TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   FunctionDef specialized;
-  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
+  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
 
   // Input and output types are resolved based on instantiation attributes.
   EXPECT_EQ("x", specialized.signature().input_arg(0).name());
@@ -573,7 +573,7 @@ TEST_F(FunctionsTest, MakeFunctionDef) {
   EXPECT_EQ(2, count);
 }
 
-TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
+TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) {
   using test::function::NDef;
 
   FunctionDef mul_func = FunctionDefHelper::Create(
@@ -606,7 +606,7 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
   // Replace function body with identity function
   item.SwapFunctionBody(std::move(id_func_body));
   FunctionDef specialized;
-  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
+  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
 
   // Check that graph body was updated.
   int count = 0;
-- 
GitLab


From 82679654af098df1de27bcdcf6fc6942ccf4f236 Mon Sep 17 00:00:00 2001
From: ADiegoCAlonso <A.Diego.C.Alonso@gmail.com>
Date: Sat, 21 Apr 2018 11:43:51 +0200
Subject: [PATCH 266/450] Add __init__py

---
 tensorflow/examples/tutorials/estimators/__init__.py | 0
 tensorflow/examples/tutorials/input_fn/__init__.py   | 0
 tensorflow/examples/tutorials/layers/__init__.py     | 0
 tensorflow/examples/tutorials/monitors/__init__.py   | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tensorflow/examples/tutorials/estimators/__init__.py
 create mode 100644 tensorflow/examples/tutorials/input_fn/__init__.py
 create mode 100644 tensorflow/examples/tutorials/layers/__init__.py
 create mode 100644 tensorflow/examples/tutorials/monitors/__init__.py

diff --git a/tensorflow/examples/tutorials/estimators/__init__.py b/tensorflow/examples/tutorials/estimators/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow/examples/tutorials/input_fn/__init__.py b/tensorflow/examples/tutorials/input_fn/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow/examples/tutorials/layers/__init__.py b/tensorflow/examples/tutorials/layers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow/examples/tutorials/monitors/__init__.py b/tensorflow/examples/tutorials/monitors/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
-- 
GitLab


From aed22c552905d74de04c98b34aabedd12926790a Mon Sep 17 00:00:00 2001
From: ADiegoCAlonso <A.Diego.C.Alonso@gmail.com>
Date: Sat, 21 Apr 2018 11:56:10 +0200
Subject: [PATCH 267/450] Specify float32 as float type instead of float64

---
 tensorflow/examples/tutorials/monitors/iris_monitors.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/examples/tutorials/monitors/iris_monitors.py b/tensorflow/examples/tutorials/monitors/iris_monitors.py
index 850d105f7b..a2b7fe6023 100644
--- a/tensorflow/examples/tutorials/monitors/iris_monitors.py
+++ b/tensorflow/examples/tutorials/monitors/iris_monitors.py
@@ -32,9 +32,9 @@ IRIS_TEST = os.path.join(os.path.dirname(__file__), "iris_test.csv")
 def main(unused_argv):
   # Load datasets.
   training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float)
+      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
   test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float)
+      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32)
 
   validation_metrics = {
       "accuracy":
@@ -83,7 +83,7 @@ def main(unused_argv):
 
   # Classify two new flower samples.
   new_samples = np.array(
-      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float)
+      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
   y = list(classifier.predict(new_samples))
   print("Predictions: {}".format(str(y)))
 
-- 
GitLab


From ddda9acc9b922a9983128fc2e47f3541b8e456bc Mon Sep 17 00:00:00 2001
From: Joe Yearsley <josephelliotyearsley@gmail.com>
Date: Sat, 21 Apr 2018 17:12:37 +0100
Subject: [PATCH 268/450] Update fold_old_batch_norms.cc

Updated as requested
---
 tensorflow/tools/graph_transforms/fold_old_batch_norms.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index 988ba25e36..f1d361e07d 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -159,7 +159,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
-  if (HasAttr(conv_node, "data_format")) {
+  if (conv_node.attr().count("data_format") > 0) {
     CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
   }
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
-- 
GitLab


From cea18851e2d81ee97ebf8e9f6aeddd55a34e3227 Mon Sep 17 00:00:00 2001
From: foo0x29a <thiago.nobayashi@gmail.com>
Date: Sat, 21 Apr 2018 13:30:52 -0300
Subject: [PATCH 269/450] fix typo

---
 .../core/grappler/optimizers/custom_graph_optimizer_registry.h  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h
index 796da91373..3148a5f809 100644
--- a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h
+++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h
@@ -33,7 +33,7 @@ class CustomGraphOptimizerRegistry {
   static std::vector<string> GetRegisteredOptimizers();
 
   typedef std::function<CustomGraphOptimizer*()> Creator;
-  // Regsiter graph optimizer which can be called during program initialization.
+  // Register graph optimizer which can be called during program initialization.
   // This class is not thread-safe.
   static void RegisterOptimizerOrDie(const Creator& optimizer_creator,
                                      const string& name);
-- 
GitLab


From 31dcaa089bb7e504b85807e9bdb96be2858f1b98 Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Fri, 20 Apr 2018 18:31:39 -0700
Subject: [PATCH 270/450] [XLA][Doc]Fix up operation semantics of BatchNorm.

We somehow committed an old version of the doc (see #, the lhs is what we wanted and the rhs is what got committed). This CL reverts last change to that CL.

PiperOrigin-RevId: 193751762
---
 .../performance/xla/operation_semantics.md    | 60 ++++++++++---------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 8373a1219d..f530fe1206 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -25,7 +25,7 @@ Calculates gradients of batch norm.
 <b> `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` </b>
 
 | Arguments       | Type                    | Semantics                        |
-| --------------  | ----------------------- | -------------------------------- |
+| --------------- | ----------------------- | -------------------------------- |
 | `operand`       | `ComputationDataHandle` | n dimensional array to be        |
 :                 :                         : normalized (x)                   :
 | `scale`         | `ComputationDataHandle` | 1 dimensional array              |
@@ -45,31 +45,37 @@ feature dimension in `operand`), the operation calculates the gradients with
 respect to `operand`, `offset` and `scale` across all the other dimensions. The
 `feature_index` must be a valid index for the feature dimension in `operand`.
 
-The three gradients are defined by the following formulas (Assuming a
-4-dimensional tensor as `operand` and (l) is the index for feature dimension):
-
-\\( coef_l = \frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h (\nabla y_{ijkl} * (x_{ijkl} - \mu_l) / (\sigma^2_{l}+\epsilon)) \\)
-
-\\( \nabla x_{ijkl} = \gamma_{l} * (1/\sqrt{\sigma^2_{l}+\epsilon}) * [\nabla y_{ijkl} - mean(\nabla y) - (x_{ijkl} - \mu_{l}) * coef_l] \\)
-
-\\( \nabla \beta_l = \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \nabla y_{ijkl} \\)
-
-\\( \nabla \gamma_l = \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \nabla y_{ijkl} * ((x_{ijkl} - \mu_l) / \sqrt{\sigma^2_{l}+\epsilon}) \\)
-
-The inputs `mean` and `variance` represents moments value
+The three gradients are defined by the following formulas (assuming a
+4-dimensional tensor as `operand` and with feature dimension index \\(l\\),
+batch size `m` and spatial sizes `w` and `h`):
+
+\\[ \begin{split} c_l&=
+\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h
+\left( \nabla y_{ijkl} \frac{x_{ijkl} - \mu_l}{\sigma^2_l+\epsilon} \right)
+\\\\
+\nabla x_{ijkl} &= \frac{\gamma_{l}}{\sqrt{\sigma^2_{l}+\epsilon}}
+\left( \nabla y_{ijkl} - \mathrm{mean}(\nabla y) - c_l (x_{ijkl} - \mu_{l})
+\right)
+\\\\
+\nabla \gamma_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \left( \nabla y_{ijkl}
+\frac{x_{ijkl} - \mu_l}{\sqrt{\sigma^2_{l}+\epsilon}} \right)
+\\\\
+\nabla \beta_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \nabla y_{ijkl}
+\end{split} \\]
+
+The inputs `mean` and `variance` represent moment values
 across batch and spatial dimensions.
 
 The output type is a tuple of three handles:
 
-|Outputs       | Type                    | Semantics                            |
-|------------- | ----------------------- | ------------------------------------ |
-|`grad_operand`| `ComputationDataHandle` | gradient with respect to input       |
-:              :                         : `operand` (\\( \nabla x\\))          :
-|`grad_scale`  | `ComputationDataHandle` | gradient with respect to input       |
-:              :                         : `scale` (\\( \nabla \gamma\\))       :
-|`grad_offset` | `ComputationDataHandle` | gradient with respect to input       |
-:              :                         : `offset`(\\( \nabla \beta\\))        :
-
+| Outputs        | Type                    | Semantics                         |
+| -------------  | ----------------------- | --------------------------------- |
+| `grad_operand` | `ComputationDataHandle` | gradient with respect to input    |
+:                :                         : `operand` (\\( \nabla x\\))       :
+| `grad_scale`   | `ComputationDataHandle` | gradient with respect to input    |
+:                :                         : `scale` (\\( \nabla \gamma\\))    :
+| `grad_offset`  | `ComputationDataHandle` | gradient with respect to input    |
+:                :                         : `offset`(\\( \nabla \beta\\))     :
 
 ## BatchNormInference
 
@@ -440,13 +446,11 @@ area and a computation is performed for each possible position of the window.
 | `lhs`            | `ComputationDataHandle` | rank n+2 array of inputs      |
 | `rhs`            | `ComputationDataHandle` | rank n+2 array of kernel      |
 :                  :                         : weights                       :
-| `window_strides` | `ArraySlice<int64>`     | size n array of kernel strides|
-| `padding`        | `ArraySlice<pair<int64, | size n array of (low, high)   |
+| `window_strides` | `ArraySlice<int64>`     | n-d array of kernel strides   |
+| `padding`        | `ArraySlice<pair<int64, | n-d array of (low, high)      |
 :                  : int64>>`                : padding                       :
-| `lhs_dilation`   | `ArraySlice<int64>`     | size n lhs dilation factor    |
-:                  :                         : array                         |
-| `rhs_dilation`   | `ArraySlice<int64>`     | size n rhs dilation factor
-:                  :                         : array                         |
+| `lhs_dilation`   | `ArraySlice<int64>`     | n-d lhs dilation factor array |
+| `rhs_dilation`   | `ArraySlice<int64>`     | n-d rhs dilation factor array |
 
 Let n be the number of spatial dimensions. The `lhs` argument is a rank n+2
 array describing the base area. This is called the input, even though of course
-- 
GitLab


From 2b5d4f794cc9c2740d27c0e8c1af2b511810e00b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 18:37:55 -0700
Subject: [PATCH 271/450] [XLA] Redesign: implement XlaComputation::Snapshot,
 and Client::LoadSnapshot.

PiperOrigin-RevId: 193752146
---
 tensorflow/compiler/xla/client/client.cc              |  5 +++++
 tensorflow/compiler/xla/client/client.h               |  3 +++
 tensorflow/compiler/xla/client/xla_client/BUILD       |  2 +-
 .../compiler/xla/client/xla_client/xla_computation.cc | 11 +++++++++++
 .../compiler/xla/client/xla_client/xla_computation.h  |  4 ++++
 tensorflow/compiler/xla/service/executable.cc         |  6 +++---
 tensorflow/compiler/xla/service/executable.h          |  4 ++--
 tensorflow/compiler/xla/service/hlo.proto             |  2 +-
 8 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index f0f94298a0..328e1b8fa8 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -235,6 +235,11 @@ StatusOr<Computation> Client::LoadSnapshot(const SessionModule& module) {
   return Computation(stub_, response.computation());
 }
 
+StatusOr<XlaComputation> Client::LoadSnapshot(const HloSnapshot& module) {
+  TF_RET_CHECK(module.has_hlo() && module.hlo().has_hlo_module());
+  return XlaComputation(module.hlo().hlo_module());
+}
+
 StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index 14c685d94e..a63ff4c56d 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -255,6 +255,9 @@ class Client {
 
   StatusOr<Computation> LoadSnapshot(const SessionModule& module);
 
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<XlaComputation> LoadSnapshot(const HloSnapshot& module);
+
   ServiceInterface* stub() { return stub_; }
 
  private:
diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD
index 31fa1241ee..0d6e207971 100644
--- a/tensorflow/compiler/xla/client/xla_client/BUILD
+++ b/tensorflow/compiler/xla/client/xla_client/BUILD
@@ -31,9 +31,9 @@ cc_library(
     hdrs = ["xla_computation.h"],
     deps = [
         "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo_proto",
-        "//tensorflow/core:lib",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
index a6752c6010..72e3935696 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
@@ -17,7 +17,9 @@ limitations under the License.
 
 #include <utility>
 
+#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
 
@@ -26,4 +28,13 @@ StatusOr<ProgramShape> XlaComputation::GetProgramShape() const {
   return proto_.program_shape();
 }
 
+StatusOr<std::unique_ptr<HloSnapshot>> XlaComputation::Snapshot() const {
+  if (IsNull()) {
+    return InvalidArgument("Computation is invalid.");
+  }
+  auto session = MakeUnique<HloSnapshot>();
+  *session->mutable_hlo()->mutable_hlo_module() = proto_;
+  return std::move(session);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
index 7ad212aa24..b70b57e9ff 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
@@ -48,6 +48,10 @@ class XlaComputation {
 
   const HloModuleProto& proto() const { return proto_; }
 
+  // Requests that we snapshot the computation into a serializable protocol
+  // buffer form.
+  StatusOr<std::unique_ptr<HloSnapshot>> Snapshot() const;
+
   // Returns true if this object is a null Computation.
   bool IsNull() const { return unique_id_ == -1; }
 
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 8218b5f7c8..be19b3ff04 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -163,9 +163,9 @@ Status Executable::DumpSessionModule() {
                                        result);
 }
 
-/* static */ Status Executable::DumpToDirectory(const string& directory_path,
-                                                string filename,
-                                                const HloSession& hlo_session) {
+/* static */ Status Executable::DumpToDirectory(
+    const string& directory_path, string filename,
+    const HloSnapshot& hlo_session) {
   tensorflow::Env* env = tensorflow::Env::Default();
   if (!env->IsDirectory(directory_path).ok()) {
     // NB! CreateDir does not work reliably with multiple XLA threads -- two
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index bdbe119120..0c95f1a361 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -156,9 +156,9 @@ class Executable {
   static Status DumpToDirectory(const string& directory_path, string filename,
                                 const SessionModule& session_module);
 
-  // Dump hlo_session to directory_path/filename.
+  // Dump hlo snapshot to directory_path/filename.
   static Status DumpToDirectory(const string& directory_path, string filename,
-                                const HloSession& hlo_session);
+                                const HloSnapshot& hlo_session);
 
  protected:
   mutable tensorflow::mutex mutex_;
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 0c3eb7dcb4..aa6860880b 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -300,7 +300,7 @@ message HloProto {
 // Encapsulates HloProto together with the arguments, result, and
 // execution_platform. This message is used for purposes such as
 // analysis/replay/file-storage.
-message HloSession {
+message HloSnapshot {
   // The hlo graph.
   HloProto hlo = 1;
 
-- 
GitLab


From 1796d17b8b1fa598627a590fad0ef81d138af558 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 20:11:19 -0700
Subject: [PATCH 272/450] Fix heuristic for computing gradients of gradients
 when there are outside_compilation clusters present, to stop creating cycles.

PiperOrigin-RevId: 193757109
---
 tensorflow/contrib/tpu/python/tpu/tpu.py | 38 +++++++-----------------
 1 file changed, 10 insertions(+), 28 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index a1690dadff..7b8786304c 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -173,36 +173,18 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
           # gradients, and put the gradient of X in cluster
           # 'root_cluster.gradient_uid'.
           #
-          # When the gradient code adds multiple Ops, it asks them to
-          # be colocated either with the original Op X, or with one of
-          # the preceding Ops that was added to the gradient. In other
-          # words, we want to detect the case where we are colocating
-          # with an Op that is in cluster root_cluster.gradient_uid
-          # and put the new Op in that same cluster if the
-          # gradient_uid is the same (the case that we are in the same
-          # invocation of gradients, and just adding new Ops to the
-          # cluster); and in a different cluster if the gradient_uids
-          # are different (the case that we are in a new invocation of
-          # gradients, taking the gradient of a previously-computed
-          # gradient).
+          # When taking a gradient of a gradient, some ops will be
+          # colocated with Op in the forward pass (e.g., cluster
+          # root_cluster) and some in the backward pass (e.g., cluster
+          # root_cluster.initial_gradient_uid). We need all of the
+          # grad-of-grad ops to be in the same cluster to avoid cyclic
+          # dependencies between clusters. We adopt a heuristic that
+          # puts any op clustered with root_cluster.<xxx> in
+          # root_cluster.gradient_uid, even if xxx was
+          # initial_gradient_uid.
           self._in_gradient_colocation = op
           parts = outside_attr.split(".")
-          if len(parts) > 1:
-            uid = parts[-1]
-            if uid == gradient_uid:
-              # Keep using the same cluster
-              cluster = outside_attr
-            else:
-              # We're taking the gradient of a gradient so make a new
-              # cluster attr, adding a new '.uid' on the end to
-              # preserve the invariant that the gradient_uid is the
-              # suffix after the last '.' in the attr.
-              cluster = outside_attr + "." + gradient_uid
-          else:
-            # We're taking the gradient of an Op in the forward pass, so
-            # make a new cluster combining the Op's cluster and the
-            # gradient id.
-            cluster = outside_attr + "." + gradient_uid
+          cluster = parts[0] + "." + gradient_uid
           self._EnterOutsideCompilationScope(cluster=cluster)
         except ValueError:
           # The attr was not present: do nothing.
-- 
GitLab


From 28b8a3c74f93f9238fa626ec7d32fbddcb56b0a8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 21 Apr 2018 08:16:47 -0700
Subject: [PATCH 273/450] Allow output has a different shape from input in the
 image.transform (#17011).

PiperOrigin-RevId: 193788768
---
 tensorflow/contrib/image/kernels/image_ops.cc |  7 ++-
 tensorflow/contrib/image/kernels/image_ops.h  |  2 +-
 tensorflow/contrib/image/ops/image_ops.cc     | 54 +++++++++++++++++--
 .../python/kernel_tests/image_ops_test.py     | 30 +++++++++++
 .../contrib/image/python/ops/image_ops.py     | 39 ++++++++------
 5 files changed, 108 insertions(+), 24 deletions(-)

diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index c2e32da133..ae4b1ba62a 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -70,6 +70,7 @@ class ImageProjectiveTransform : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& images_t = ctx->input(0);
     const Tensor& transform_t = ctx->input(1);
+    const Tensor& output_dim = ctx->input(2);
     OP_REQUIRES(ctx, images_t.shape().dims() == 4,
                 errors::InvalidArgument("Input images must have rank 4"));
     OP_REQUIRES(ctx,
@@ -83,7 +84,11 @@ class ImageProjectiveTransform : public OpKernel {
     auto images = images_t.tensor<T, 4>();
     auto transform = transform_t.matrix<float>();
     Tensor* output_t;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t));
+    // Image is NHWC format.
+    auto output_shape = images_t.shape();
+    output_shape.set_dim(1, output_dim.vec<int>()(0));
+    output_shape.set_dim(2, output_dim.vec<int>()(1));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t));
     auto output = output_t->tensor<T, 4>();
     (FillProjectiveTransform<Device, T>(interpolation_))(
         ctx->eigen_device<Device>(), &output, images, transform);
diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h
index ad50133061..2320329b92 100644
--- a/tensorflow/contrib/image/kernels/image_ops.h
+++ b/tensorflow/contrib/image/kernels/image_ops.h
@@ -161,7 +161,7 @@ struct FillProjectiveTransform {
   void operator()(const Device& device, OutputType* output,
                   const InputType& images,
                   const TransformsType& transform) const {
-    output->device(device) = images.generate(
+    output->device(device) = output->generate(
         ProjectiveGenerator<Device, T>(images, transform, interpolation_));
   }
 };
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 68771b3d05..e97267fb89 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -19,9 +19,55 @@ limitations under the License.
 
 namespace tensorflow {
 
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+namespace {
+
+// Sets output[0] to shape [batch_dim,height,width,channel_dim], where
+// height and width come from the size_tensor.
+Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim,
+                             int size_input_idx, DimensionHandle channel_dim) {
+  // Verify shape of size input.
+  ShapeHandle size;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size));
+  DimensionHandle unused;
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused));
+
+  // Get size values from the size tensor.
+  const Tensor* size_tensor = c->input_tensor(size_input_idx);
+  DimensionHandle width;
+  DimensionHandle height;
+  if (size_tensor == nullptr) {
+    width = c->UnknownDim();
+    height = c->UnknownDim();
+  } else {
+    // TODO(petewarden) - Remove once we have constant evaluation in C++ only.
+    if (size_tensor->dtype() != DT_INT32) {
+      return errors::InvalidArgument(
+          "Bad size input type for SetOutputToSizedImage: Expected DT_INT32 "
+          "but got ",
+          DataTypeString(size_tensor->dtype()), " for input #", size_input_idx,
+          " in ", c->DebugString());
+    }
+    auto vec = size_tensor->vec<int32>();
+    height = c->MakeDim(vec(0));
+    width = c->MakeDim(vec(1));
+  }
+  c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim}));
+  return Status::OK();
+}
+
+Status ResizeShapeFn(InferenceContext* c) {
+  ShapeHandle input;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
+  return SetOutputToSizedImage(c, c->Dim(input, 0), 2 /* size_input_idx */,
+                               c->Dim(input, 3));
+}
+
+}  // namespace
+
 // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc.
 // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0).
 // TODO(ringwalt): Add an "output_shape" argument. This is sufficient to
@@ -29,13 +75,11 @@ using shape_inference::ShapeHandle;
 REGISTER_OP("ImageProjectiveTransform")
     .Input("images: dtype")
     .Input("transforms: float32")
+    .Input("output_shape: int32")
     .Attr("dtype: {uint8, int32, int64, float32, float64}")
     .Attr("interpolation: string")
     .Output("transformed_images: dtype")
-    .SetShapeFn([](InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      return Status::OK();
-    })
+    .SetShapeFn(ResizeShapeFn)
     .Doc(R"doc(
 Applies the given transform to each of the images.
 
@@ -49,7 +93,7 @@ If one row of `transforms` is `[a0, a1, a2, b0, b1, b2, c0, c1]`, then it maps
 the *output* point `(x, y)` to a transformed *input* point
 `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, where
 `k = c0 x + c1 y + 1`. If the transformed point lays outside of the input
-image, the output pixel is set to 0. The output is the same size as the input,
+image, the output pixel is set to 0.
 
 images: 4D `Tensor`, input image(s) in NHWC format.
 transforms: 2D `Tensor`, projective transform(s) to apply to the image(s).
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index b50177ae56..c0151d320f 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -195,10 +195,40 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
           x_init_value=test_image)
       self.assertLess(left_err, 1e-10)
 
+  def _test_grad_different_shape(self, input_shape, output_shape):
+    with self.test_session():
+      test_image_shape = input_shape
+      test_image = np.random.randn(*test_image_shape)
+      test_image_tensor = constant_op.constant(
+          test_image, shape=test_image_shape)
+      test_transform = image_ops.angles_to_projective_transforms(
+          np.pi / 2, 4, 4)
+
+      if len(output_shape) == 2:
+        resize_shape = output_shape
+      elif len(output_shape) == 3:
+        resize_shape = output_shape[0:2]
+      elif len(output_shape) == 4:
+        resize_shape = output_shape[1:3]
+      output = image_ops.transform(
+          images=test_image_tensor,
+          transforms=test_transform,
+          output_shape=resize_shape)
+      left_err = gradient_checker.compute_gradient_error(
+          test_image_tensor,
+          test_image_shape,
+          output,
+          output_shape,
+          x_init_value=test_image)
+      self.assertLess(left_err, 1e-10)
+
   def test_grad(self):
     self._test_grad([16, 16])
     self._test_grad([4, 12, 12])
     self._test_grad([3, 4, 12, 12])
+    self._test_grad_different_shape([16, 16], [8, 8])
+    self._test_grad_different_shape([4, 12, 3], [8, 24, 3])
+    self._test_grad_different_shape([3, 4, 12, 3], [3, 8, 24, 3])
 
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index c139ae89d8..a8d8cf8c5c 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -212,7 +212,11 @@ def translations_to_projective_transforms(translations, name=None):
         axis=1)
 
 
-def transform(images, transforms, interpolation="NEAREST", name=None):
+def transform(images,
+              transforms,
+              interpolation="NEAREST",
+              output_shape=None,
+              name=None):
   """Applies the given transform(s) to the image(s).
 
   Args:
@@ -229,6 +233,10 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
        the transform mapping input points to output points. Note that gradients
        are not backpropagated into transformation parameters.
     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
+    output_shape: Output dimension after the transform, [height, width].
+       If None, output is the same size as input image.
+
+    name: The name of the op.
 
   Returns:
     Image(s) with the same type and shape as `images`, with the given
@@ -255,6 +263,13 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
     else:
       raise TypeError("Images should have rank between 2 and 4.")
 
+    if output_shape is None:
+      output_shape = array_ops.shape(images)[1:3]
+    elif len(output_shape) != 2:
+      raise TypeError(
+          "output_shape must either be None or a vector of 2 elements. %s" %
+          str(output_shape))
+
     if len(transform_or_transforms.get_shape()) == 1:
       transforms = transform_or_transforms[None]
     elif transform_or_transforms.get_shape().ndims is None:
@@ -265,7 +280,7 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
     else:
       raise TypeError("Transforms should have rank 1 or 2.")
     output = gen_image_ops.image_projective_transform(
-        images, transforms, interpolation=interpolation.upper())
+        images, transforms, output_shape, interpolation=interpolation.upper())
     if len(image_or_images.get_shape()) == 2:
       return output[0, :, :, 0]
     elif len(image_or_images.get_shape()) == 3:
@@ -375,14 +390,6 @@ def _image_projective_transform_grad(op, grad):
 
   if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES:
     raise TypeError("Invalid dtype %s." % image_or_images.dtype)
-  if len(image_or_images.get_shape()) == 2:
-    images = image_or_images[None, :, :, None]
-  elif len(image_or_images.get_shape()) == 3:
-    images = image_or_images[None, :, :, :]
-  elif len(image_or_images.get_shape()) == 4:
-    images = image_or_images
-  else:
-    raise TypeError("Images should have rank between 2 and 4")
   if len(transform_or_transforms.get_shape()) == 1:
     transforms = transform_or_transforms[None]
   elif len(transform_or_transforms.get_shape()) == 2:
@@ -395,13 +402,11 @@ def _image_projective_transform_grad(op, grad):
   inverse = linalg_ops.matrix_inverse(transforms)
   transforms = matrices_to_flat_transforms(inverse)
   output = gen_image_ops.image_projective_transform(
-      grad, transforms, interpolation=interpolation)
-  if len(image_or_images.get_shape()) == 2:
-    return [output[0, :, :, 0], None]
-  elif len(image_or_images.get_shape()) == 3:
-    return [output[0, :, :, :], None]
-  else:
-    return [output, None]
+      images=grad,
+      transforms=transforms,
+      output_shape=array_ops.shape(image_or_images)[1:3],
+      interpolation=interpolation)
+  return [output, None, None]
 
 
 def bipartite_match(distance_mat,
-- 
GitLab


From fe4146d884c8805fceaa6d73d0bcc7fbf21df7cd Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 21 Apr 2018 18:42:03 +0000
Subject: [PATCH 274/450] Update .gitignore for cmake generated files

After running cmake on Linux with:
```
tensorflow/tools/ci_build/ci_build.sh CMAKE tensorflow/tools/ci_build/builds/cmake.sh
```

the following file is left:
```
ubuntu@ubuntu:~/tensorflow$ git status
On branch master
Your branch is up-to-date with 'origin/master'.
Untracked files:
  (use "git add <file>..." to include in what will be committed)

        api_init_files_list.txt

nothing added to commit but untracked files present (use "git add" to track)
ubuntu@ubuntu:~/tensorflow$
```

This fix updates the .gitignore file so that cmake generated files
are not added to git inadvertently.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index be75938ec4..828bbe9bd3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@ Podfile.lock
 /tensorflow/contrib/lite/examples/ios/simple/data/*.txt
 /tensorflow/contrib/lite/examples/ios/simple/data/*.tflite
 xcuserdata/**
+/api_init_files_list.txt
 
 # Android
 .gradle
-- 
GitLab


From 8f558d67450f3ec6aa0d96af9fad84042d6b79df Mon Sep 17 00:00:00 2001
From: AG Ramesh <ag.ramesh@intel.com>
Date: Sat, 21 Apr 2018 15:25:37 -0700
Subject: [PATCH 275/450] Changed calls to the deprecated StringPiece::contains
 with str_util::StrContains

---
 tensorflow/core/graph/mkl_layout_pass.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 5368774f2d..72a13d4da7 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -547,14 +547,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // If Op has been specifically assigned to a non-CPU device, then No.
     if (!n->assigned_device_name().empty() &&
-        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) {
       result = false;
       reason = "Op has been assigned a runtime device that is not CPU.";
     }
 
     // If user has specifically assigned this op to a non-CPU device, then No.
     if (!n->def().device().empty() &&
-        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) {
       result = false;
       reason = "User has assigned a device that is not CPU.";
     }
@@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // If Op has been specifically assigned to a non-CPU device, then No.
     if (!n->assigned_device_name().empty() &&
-        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) {
       result = false;
       reason = "Op has been assigned a runtime device that is not CPU.";
     }
 
     // If user has specifically assigned this op to a non-CPU device, then No.
     if (!n->def().device().empty() &&
-        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) {
       result = false;
       reason = "User has assigned a device that is not CPU.";
     }
-- 
GitLab


From 5518db48074c3bd125089bccc3edec03c192bf56 Mon Sep 17 00:00:00 2001
From: Bryan Heden <b.heden@gmail.com>
Date: Sat, 21 Apr 2018 19:45:42 -0500
Subject: [PATCH 276/450] update $ source spacing

When viewing install_linux, the spacing was off for 'Next Steps' section.
---
 tensorflow/docs_src/install/install_linux.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 1a349f5412..02af21bcf2 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -231,7 +231,7 @@ Note that you must activate the Virtualenv environment each time you
 use TensorFlow. If the Virtualenv environment is not currently active,
 invoke one of the following commands:
 
-<pre> $ <b>source ~/tensorflow/bin/activate</b>      # bash, sh, ksh, or zsh
+<pre>$ <b>source ~/tensorflow/bin/activate</b>      # bash, sh, ksh, or zsh
 $ <b>source ~/tensorflow/bin/activate.csh</b>  # csh or tcsh</pre>
 
 When the Virtualenv environment is active, you may run
-- 
GitLab


From 5b7b354efe3eff5756623b04b87b4cd5272f82cc Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Sat, 21 Apr 2018 21:37:48 -0700
Subject: [PATCH 277/450] [XLA] Add an option to the CSE pass to ignore
 non-fusion computations

PiperOrigin-RevId: 193814728
---
 tensorflow/compiler/xla/service/hlo_cse.cc |  4 ++++
 tensorflow/compiler/xla/service/hlo_cse.h  | 11 +++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index cd7cbbdd71..3b22c93733 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -97,6 +97,10 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
   const std::function<bool(const HloComputation*, const HloComputation*)>
       eq_computations = std::equal_to<const HloComputation*>();
   for (auto* computation : module->computations()) {
+    if (only_fusion_computations_ && !computation->IsFusionComputation()) {
+      continue;
+    }
+
     changed |= CombineConstants(computation, is_layout_sensitive_);
 
     std::list<HloInstruction*> post_order =
diff --git a/tensorflow/compiler/xla/service/hlo_cse.h b/tensorflow/compiler/xla/service/hlo_cse.h
index 70096e07a2..5e2b348bdd 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.h
+++ b/tensorflow/compiler/xla/service/hlo_cse.h
@@ -29,9 +29,11 @@ class HloCSE : public HloPassInterface {
  public:
   // If is_layout_sensitive is true, then the simplifier preserves layout during
   // transformation. Otherwise, layout is ignored.
-  explicit HloCSE(bool is_layout_sensitive)
-      : is_layout_sensitive_(is_layout_sensitive) {}
-  ~HloCSE() override {}
+  explicit HloCSE(bool is_layout_sensitive,
+                  bool only_fusion_computations = false)
+      : is_layout_sensitive_(is_layout_sensitive),
+        only_fusion_computations_(only_fusion_computations) {}
+  ~HloCSE() override = default;
   tensorflow::StringPiece name() const override { return "cse"; }
 
   // Run CSE on the given module. Returns whether the module was changed (common
@@ -39,7 +41,8 @@ class HloCSE : public HloPassInterface {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  bool is_layout_sensitive_;
+  const bool is_layout_sensitive_;
+  const bool only_fusion_computations_;
 };
 
 }  // namespace xla
-- 
GitLab


From 292d9b92c93e97e98284787a1a60c30553fee5cb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 22 Apr 2018 07:13:16 -0700
Subject: [PATCH 278/450] Fixed typo in crossed column code snippet.

PiperOrigin-RevId: 193838865
---
 tensorflow/docs_src/get_started/feature_columns.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md
index d8e4bec863..9c777a0077 100644
--- a/tensorflow/docs_src/get_started/feature_columns.md
+++ b/tensorflow/docs_src/get_started/feature_columns.md
@@ -364,7 +364,7 @@ def make_dataset(latitude, longitude, labels):
     return tf.data.Dataset.from_tensor_slices((features, labels))
 
 
-# Bucketize the latitude and longitude usig the `edges`
+# Bucketize the latitude and longitude using the `edges`
 latitude_bucket_fc = tf.feature_column.bucketized_column(
     tf.feature_column.numeric_column('latitude'),
     list(atlanta.latitude.edges))
-- 
GitLab


From e1722aa3197b3942add6b9fb78ed50e21af693ff Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 22 Apr 2018 07:29:33 -0700
Subject: [PATCH 279/450] Multi-thread implementation of
 ExperimentalShuffledFullyConnected using the gemmlowp threadpool.

PiperOrigin-RevId: 193839485
---
 .../internal/optimized/optimized_ops.h        | 146 +++++++++++++-----
 1 file changed, 111 insertions(+), 35 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index d269056800..2e2721e093 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1203,39 +1203,16 @@ void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                  output_activation_max, output_data, output_dims, gemm_context);
 }
 
-inline void ExperimentalShuffledFullyConnected(
-    const uint8* input_data, const Dims<4>& input_dims,
-    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
-    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
-    int output_shift, int32 output_activation_min, int32 output_activation_max,
-    int16* output_data, const Dims<4>& output_dims,
-    gemmlowp::GemmContext* gemm_context) {
-  gemmlowp::ScopedProfilingLabel label(
-      "ExperimentalShuffledFullyConnected/8bit");
-  (void)gemm_context;  // only used in optimized code.
-  TFLITE_DCHECK_EQ(output_activation_min, -32768);
-  TFLITE_DCHECK_EQ(output_activation_max, 32767);
-  // TODO(benoitjacob): This really should be:
-  //     const int batches = ArraySize(output_dims, 1);
-  // but the current --variable_batch hack consists in overwriting the 3rd
-  // dimension with the runtime batch size, as we don't keep track for each
-  // array of which dimension is the batch dimension in it.
-  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
-                      ArraySize(output_dims, 3);
-  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
-  const int accum_depth = ArraySize(weights_dims, 0);
-  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
-  // The experimental shuffling is an optimization for matrix*vector product.
-  // We aren't interested in supporting non-matrix*vector-product cases, i.e.
-  // batches>1.
-  TFLITE_DCHECK_EQ(batches, 1);
-  // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
-  // so that just reinterpreting them as int8 values is equivalent to
-  // subtracting 128 from them, thus implementing for free the subtraction of
-  // the zero_point value 128.
-  const int8* shuffled_weights_ptr =
-      reinterpret_cast<const int8*>(shuffled_weights_data);
+// Internal function doing the actual arithmetic work for
+// ExperimentalShuffledFullyConnected.
+// May be called either directly by it (single-threaded case) or may be used
+// as the 'task' for worker threads to run (multi-threaded case, see
+// ExperimentalShuffledFullyConnectedWorkerTask below).
+inline void ExperimentalShuffledFullyConnectedWorkerImpl(
+    const uint8* input_data, const int8* shuffled_weights_data,
+    int output_depth, int accum_depth, const int32* bias_data,
+    int32 output_multiplier, int output_shift, int16* output_data) {
+  const int8* shuffled_weights_ptr = shuffled_weights_data;
 #if defined USE_NEON
   // We'll only need to xor signbit to the input activation values, as
   // that xor-ing is pre-built into the shuffled weights values.
@@ -1331,14 +1308,113 @@ inline void ExperimentalShuffledFullyConnected(
       acc =
           MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift);
       // Saturate, cast to int16, and store to output array.
-      acc = std::max(acc, output_activation_min);
-      acc = std::min(acc, output_activation_max);
+      acc = std::max(acc, -32768);
+      acc = std::min(acc, 32767);
       output_data[c + i] = acc;
     }
   }
 #endif
 }
 
+// Wraps ExperimentalShuffledFullyConnectedWorkerImpl into a Task class
+// to allow using gemmlowp's threadpool.
+struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task {
+  ExperimentalShuffledFullyConnectedWorkerTask(
+      const uint8* input_data, const int8* shuffled_weights_data,
+      int output_depth, int accum_depth, const int32* bias_data,
+      int32 output_multiplier, int output_shift, int16* output_data)
+      : input_data_(input_data),
+        shuffled_weights_data_(shuffled_weights_data),
+        output_depth_(output_depth),
+        accum_depth_(accum_depth),
+        bias_data_(bias_data),
+        output_multiplier_(output_multiplier),
+        output_shift_(output_shift),
+        output_data_(output_data) {}
+
+  void Run() override {
+    ExperimentalShuffledFullyConnectedWorkerImpl(
+        input_data_, shuffled_weights_data_, output_depth_, accum_depth_,
+        bias_data_, output_multiplier_, output_shift_, output_data_);
+  }
+
+  const uint8* input_data_;
+  const int8* shuffled_weights_data_;
+  int output_depth_;
+  int accum_depth_;
+  const int32* bias_data_;
+  int32 output_multiplier_;
+  int output_shift_;
+  int16* output_data_;
+};
+
+inline void ExperimentalShuffledFullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    int16* output_data, const Dims<4>& output_dims,
+    gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label(
+      "ExperimentalShuffledFullyConnected/8bit");
+  (void)gemm_context;  // only used in optimized code.
+  TFLITE_DCHECK_EQ(output_activation_min, -32768);
+  TFLITE_DCHECK_EQ(output_activation_max, 32767);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  const int accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  // The experimental shuffling is an optimization for matrix*vector product.
+  // We aren't interested in supporting non-matrix*vector-product cases, i.e.
+  // batches>1.
+  TFLITE_DCHECK_EQ(batches, 1);
+  // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+  // so that just reinterpreting them as int8 values is equivalent to
+  // subtracting 128 from them, thus implementing for free the subtraction of
+  // the zero_point value 128.
+  const int8* int8_shuffled_weights_data =
+      reinterpret_cast<const int8*>(shuffled_weights_data);
+
+  // Our GEMV kernel has 4 rows. This doesn't matter in practice for GEMV
+  // shapes, gemmlowp::HowManyThreads only takes that parameter because it
+  // matters for other kinds of GEMM shapes.
+  static constexpr int kKernelRows = 4;
+  const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
+      gemm_context->max_num_threads(), output_depth, 1, accum_depth);
+  if (thread_count == 1) {
+    // Single-thread case: do the computation on the current thread, don't
+    // use a threadpool
+    ExperimentalShuffledFullyConnectedWorkerImpl(
+        input_data, int8_shuffled_weights_data, output_depth, accum_depth,
+        bias_data, output_multiplier, output_shift, output_data);
+    return;
+  }
+
+  // Multi-threaded case: use the gemmlowp context's threadpool.
+  TFLITE_DCHECK_GT(thread_count, 1);
+  std::vector<gemmlowp::Task*> tasks(thread_count);
+  const int kRowsPerWorker =
+      gemmlowp::RoundUp<kKernelRows>(output_depth / thread_count);
+  int row_start = 0;
+  for (int i = 0; i < thread_count; i++) {
+    int row_end = std::min(output_depth, row_start + kRowsPerWorker);
+    tasks[i] = new ExperimentalShuffledFullyConnectedWorkerTask(
+        input_data, int8_shuffled_weights_data + row_start * accum_depth,
+        row_end - row_start, accum_depth, bias_data + row_start,
+        output_multiplier, output_shift, output_data + row_start);
+    row_start = row_end;
+  }
+  TFLITE_DCHECK_EQ(row_start, output_depth);
+  gemm_context->workers_pool()->Execute(tasks);
+}
+
 template <typename T>
 inline void ExtractPatchIntoBufferColumn(
     const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
-- 
GitLab


From bfffd2041106dac5b7bb3efcbb311a20505ac61f Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 14:43:21 +0000
Subject: [PATCH 280/450] Update docs to add note and examples for
 tf.count_nonzero with string

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 31ce83905b..30ac001c25 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1466,9 +1466,18 @@ def count_nonzero(input_tensor,
   tf.count_nonzero(x, [0, 1])  # 3
   ```
 
+  **NOTE** Strings are compared against the zero-length empty string `""`. Any
+  string with a size greater than zero is considered nonzero.
+
+  For example:
+  ```python
+  x = tf.constant(["", "a", "  ", "b", ""])
+  tf.count_nonzero(x) # 3, with "a", "  ", and "b" as nonzero strings.
+  ```
+
   Args:
-    input_tensor: The tensor to reduce. Should be of numeric type, `string`,
-      or `bool`.
+    input_tensor: The tensor to reduce. Should be of numeric type, `bool`,
+      or `string`.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-- 
GitLab


From 522e20ef9cff8a7a49322c6442d940aa556222c0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 22 Apr 2018 09:15:38 -0700
Subject: [PATCH 281/450] Change refs/unrefs of FLR.

PiperOrigin-RevId: 193843055
---
 tensorflow/core/common_runtime/function.cc    | 52 ++++++++++---------
 .../core/common_runtime/function_test.cc      | 27 ++--------
 .../function_threadpool_test.cc               | 14 +----
 .../process_function_library_runtime.cc       | 21 +-------
 .../process_function_library_runtime.h        |  3 --
 .../process_function_library_runtime_test.cc  | 10 ++--
 6 files changed, 38 insertions(+), 89 deletions(-)

diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index d310520ebd..a6f637b488 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -209,6 +209,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   // The instantiated and transformed function is encoded as a Graph
   // object, and an executor is created for the graph.
   struct Item : public core::RefCounted {
+    bool invalidated = false;
     const Graph* graph = nullptr;                            // Owned by exec.
     const FunctionLibraryDefinition* overlay_lib = nullptr;  // Not owned.
     FunctionBody* func_graph = nullptr;
@@ -284,15 +285,7 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
 }
 
 FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {
-  // The most common patterns of FLR usage don't require the caller to
-  // explicitly release handles. As a result, we try to unref each item until
-  // it's erased.
-  for (auto item : items_) {
-    if (item.second) {
-      while (!item.second->Unref()) {
-      }
-    }
-  }
+  for (auto p : items_) p.second->Unref();
 }
 
 // An asynchronous op kernel which executes an instantiated function
@@ -497,24 +490,30 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
   options_copy.target = device_name_;
   const string key = Canonicalize(function_name, attrs, options_copy);
 
+  Handle found_handle = kInvalidHandle;
   {
     mutex_lock l(mu_);
-    *handle = parent_->GetHandle(key);
-    if (*handle != kInvalidHandle) {
+    found_handle = parent_->GetHandle(key);
+    if (found_handle != kInvalidHandle) {
       FunctionLibraryRuntime::LocalHandle handle_on_device =
-          parent_->GetHandleOnDevice(device_name_, *handle);
+          parent_->GetHandleOnDevice(device_name_, found_handle);
       if (handle_on_device == kInvalidLocalHandle) {
         return errors::Internal("LocalHandle not found for handle ", *handle,
                                 ".");
       }
-      auto item_handle = items_.find(handle_on_device);
-      if (item_handle == items_.end()) {
+      auto iter = items_.find(handle_on_device);
+      if (iter == items_.end()) {
         return errors::Internal("LocalHandle ", handle_on_device,
-                                " for handle ", *handle,
+                                " for handle ", found_handle,
                                 " not found in items.");
       }
-      item_handle->second->Ref();
-      return Status::OK();
+      Item* item = iter->second;
+      if (!item->invalidated) {
+        *handle = found_handle;
+        return Status::OK();
+      }
+      // *item is invalidated. Fall through and instantiate the given
+      // function_name/attrs/option again.
     }
   }
 
@@ -546,10 +545,10 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
 
   {
     mutex_lock l(mu_);
-    *handle = parent_->GetHandle(key);
-    if (*handle != kInvalidHandle) {
+    Handle found_handle_again = parent_->GetHandle(key);
+    if (found_handle_again != found_handle) {
       delete fbody;
-      items_[parent_->GetHandleOnDevice(device_name_, *handle)]->Ref();
+      *handle = found_handle_again;
     } else {
       *handle = parent_->AddHandle(key, device_name_, next_handle_);
       Item* item = new Item;
@@ -566,16 +565,12 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
   if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
     return parent_->ReleaseHandle(handle);
   }
-
   LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle);
   CHECK_NE(h, kInvalidLocalHandle);
   mutex_lock l(mu_);
   CHECK_EQ(1, items_.count(h));
   Item* item = items_[h];
-  if (item->Unref()) {
-    items_.erase(h);
-    TF_RETURN_IF_ERROR(parent_->RemoveHandle(handle));
-  }
+  item->invalidated = true;  // Reinstantiate later.
   return Status::OK();
 }
 
@@ -736,6 +731,7 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
   // computation is done and stored in *rets, we send the return values back
   // to the source_device (caller) so that the ProcFLR can receive them later.
   std::vector<Tensor>* remote_args = new std::vector<Tensor>;
+  item->Ref();
   ProcessFunctionLibraryRuntime::ReceiveTensorsAsync(
       source_device, target_device, "arg_", src_incarnation, args.size(),
       device_context, {}, rendezvous, remote_args,
@@ -747,6 +743,7 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
           s = frame->SetArgs(*remote_args);
         }
         if (!s.ok()) {
+          item->Unref();
           delete frame;
           delete remote_args;
           delete exec_args;
@@ -757,6 +754,7 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
             *exec_args, [item, frame, rets, done, source_device, target_device,
                          target_incarnation, rendezvous, device_context,
                          remote_args, exec_args](const Status& status) {
+              core::ScopedUnref unref(item);
               Status s = status;
               if (s.ok()) {
                 s = frame->ConsumeRetvals(rets);
@@ -842,11 +840,13 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     return;
   }
 
+  item->Ref();
   item->exec->RunAsync(
       // Executor args
       *exec_args,
       // Done callback.
       [item, frame, rets, done, exec_args](const Status& status) {
+        core::ScopedUnref unref(item);
         Status s = status;
         if (s.ok()) {
           s = frame->ConsumeRetvals(rets);
@@ -906,6 +906,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   exec_args->runner = *run_opts.runner;
   exec_args->call_frame = frame;
 
+  item->Ref();
   item->exec->RunAsync(
       // Executor args
       *exec_args,
@@ -914,6 +915,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
           [item, frame, exec_args](DoneCallback done,
                                    // Start unbound arguments.
                                    const Status& status) {
+            core::ScopedUnref unref(item);
             delete exec_args;
             done(status);
           },
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 61b2f0e60f..373fc64007 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -231,19 +231,8 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       return status;
     }
     FunctionLibraryRuntime::Options opts;
-    status = Run(flr, handle, opts, args, rets, add_runner);
-    if (!status.ok()) return status;
-
-    // Release the handle and try running again. It should not succeed.
-    status = flr->ReleaseHandle(handle);
-    if (!status.ok()) return status;
-
-    Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
-
-    return status;
+    TF_RETURN_IF_ERROR(Run(flr, handle, opts, args, rets, add_runner));
+    return flr->ReleaseHandle(handle);
   }
 
   Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
@@ -304,16 +293,8 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       *rets[i] = retvals[i];
     }
 
-    // Release the handle and try running again. It should not succeed.
-    status = flr->ReleaseHandle(handle);
-    if (!status.ok()) return status;
-
-    Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
-
-    return status;
+    // Release the handle.
+    return flr->ReleaseHandle(handle);
   }
 
   std::unique_ptr<Graph> GetFuncBody(FunctionLibraryRuntime* flr,
diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc
index 2d09e83d01..98dac38a8c 100644
--- a/tensorflow/core/common_runtime/function_threadpool_test.cc
+++ b/tensorflow/core/common_runtime/function_threadpool_test.cc
@@ -144,19 +144,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       return status;
     }
     FunctionLibraryRuntime::Options opts;
-    status = Run(flr, handle, opts, args, rets, add_runner);
-    if (!status.ok()) return status;
-
-    // Release the handle and try running again. It should not succeed.
-    status = flr->ReleaseHandle(handle);
-    if (!status.ok()) return status;
-
-    Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
-
-    return status;
+    return Run(flr, handle, opts, args, std::move(rets), add_runner);
   }
 
   Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index d05f146f21..e61ed8c479 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -181,12 +181,7 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
     const string& function_key, const string& device_name,
     FunctionLibraryRuntime::LocalHandle local_handle) {
   mutex_lock l(mu_);
-  FunctionLibraryRuntime::Handle h =
-      gtl::FindWithDefault(table_, function_key, kInvalidHandle);
-  if (h != kInvalidHandle) {
-    if (function_data_.count(h) != 0) return h;
-  }
-  h = next_handle_;
+  auto h = next_handle_;
   FunctionData* fd = new FunctionData(device_name, local_handle);
   function_data_[h] = std::unique_ptr<FunctionData>(fd);
   table_[function_key] = h;
@@ -197,12 +192,7 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
 FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::GetHandle(
     const string& function_key) const {
   mutex_lock l(mu_);
-  FunctionLibraryRuntime::Handle h =
-      gtl::FindWithDefault(table_, function_key, kInvalidHandle);
-  if (h != kInvalidHandle) {
-    if (function_data_.count(h) == 0) return kInvalidHandle;
-  }
-  return h;
+  return gtl::FindWithDefault(table_, function_key, kInvalidHandle);
 }
 
 bool ProcessFunctionLibraryRuntime::IsInstantiatedOnDevice(
@@ -272,13 +262,6 @@ Status ProcessFunctionLibraryRuntime::Instantiate(
   return Status::OK();
 }
 
-Status ProcessFunctionLibraryRuntime::RemoveHandle(
-    FunctionLibraryRuntime::Handle handle) {
-  mutex_lock l(mu_);
-  function_data_.erase(handle);
-  return Status::OK();
-}
-
 Status ProcessFunctionLibraryRuntime::ReleaseHandle(
     FunctionLibraryRuntime::Handle handle) {
   FunctionLibraryRuntime* flr = nullptr;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index c7b8259f78..05e5770899 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -134,9 +134,6 @@ class ProcessFunctionLibraryRuntime {
   // of the device where the function is registered.
   string GetDeviceName(FunctionLibraryRuntime::Handle handle);
 
-  // Removes handle from the state owned by this object.
-  Status RemoveHandle(FunctionLibraryRuntime::Handle handle);
-
   Status Clone(Env* env, int graph_def_version,
                const OptimizerOptions& optimizer_options,
                CustomKernelCreator custom_kernel_creator,
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index 4fbf2abc67..cc10e77ad2 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -119,12 +119,13 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
 
     EXPECT_GE(call_count, 1);  // Test runner is used.
 
-    // Release the handle and then try running the function. It shouldn't
-    // succeed.
+    // Release the handle and then try running the function.  It
+    // should still succeed.
     status = proc_flr_->ReleaseHandle(handle);
     if (!status.ok()) {
       return status;
     }
+
     Notification done2;
     proc_flr_->Run(opts, handle, args, &out,
                    [&status, &done2](const Status& s) {
@@ -132,10 +133,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
                      done2.Notify();
                    });
     done2.WaitForNotification();
-    EXPECT_TRUE(errors::IsNotFound(status));
-    EXPECT_TRUE(str_util::StrContains(status.error_message(), "not found."));
-
-    return Status::OK();
+    return status;
   }
 
   std::vector<Device*> devices_;
-- 
GitLab


From d481f07549470b4a03b41f9bb588d7f7ddc85082 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Sun, 22 Apr 2018 09:26:15 -0700
Subject: [PATCH 282/450] Remove proto header include in core/kernels.

The goal is to make kernels mostly independent of proto headers, which will let us lock down our .so import.

PiperOrigin-RevId: 193843351
---
 .../remote_fused_graph_execute_info.proto     |  8 ----
 tensorflow/core/kernels/BUILD                 |  1 +
 .../hexagon/hexagon_control_wrapper.cc        |  1 +
 .../hexagon/hexagon_graph_execution_test.cc   |  1 +
 .../kernels/i_remote_fused_graph_executor.h   |  4 +-
 .../remote_fused_graph_execute_utils.cc       | 46 +++++++++----------
 .../remote_fused_graph_execute_utils.h        | 28 +++++++----
 .../remote_fused_graph_execute_utils_test.cc  |  1 +
 ...ote_fused_graph_rewriter_transform_test.cc |  1 +
 tensorflow/core/kernels/summary_interface.h   |  5 +-
 tensorflow/core/kernels/summary_kernels.cc    |  1 +
 11 files changed, 52 insertions(+), 45 deletions(-)

diff --git a/tensorflow/core/framework/remote_fused_graph_execute_info.proto b/tensorflow/core/framework/remote_fused_graph_execute_info.proto
index 389a08ac2f..946da40d0e 100644
--- a/tensorflow/core/framework/remote_fused_graph_execute_info.proto
+++ b/tensorflow/core/framework/remote_fused_graph_execute_info.proto
@@ -14,14 +14,6 @@ import "tensorflow/core/framework/types.proto";
 // not valid across executions, but can be serialized back and forth from within
 // a single run.
 message RemoteFusedGraphExecuteInfo {
-  enum NodeType {
-    UNUSED = 0;
-    GRAPH_INPUT = 1;
-    GRAPH_OUTPUT = 2;
-    FUSED_NODE = 3;
-    BORDER_INPUT = 4;
-    BORDER_OUTPUT = 5;
-  }
 
   message TensorShapeTypeProto {
     DataType dtype = 1;
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 7ef15da143..f7f6a9b505 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -5925,6 +5925,7 @@ tf_cc_test(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
index 66d24d171d..3810cbe5b5 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h"
 
 #include "tensorflow/core/framework/graph_transfer_info.pb.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
 #include "tensorflow/core/kernels/hexagon/soc_interface.h"
diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
index 5fb6b9247f..d53977703e 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
@@ -30,6 +30,7 @@ adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp
 #include <memory>
 
 #include "tensorflow/core/framework/graph_transfer_info.pb.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/kernels/hexagon/graph_transfer_utils.h"
diff --git a/tensorflow/core/kernels/i_remote_fused_graph_executor.h b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
index eb6b64da58..6072412689 100644
--- a/tensorflow/core/kernels/i_remote_fused_graph_executor.h
+++ b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
@@ -16,13 +16,15 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
 #define TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
 
-#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
+class GraphDef;
+class RemoteFusedGraphExecuteInfo;
+
 class IRemoteFusedGraphExecutor {
  public:
   using TensorAllocatorFunc = std::function<Tensor*(const TensorShape& shape)>;
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
index e2709c117d..cc4d9a49a0 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
@@ -20,7 +20,9 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -1125,46 +1127,43 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
     for (size_t i = 0; i < inputs.size(); ++i) {
       if (IsSameNodeName(node_def, inputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::GRAPH_INPUT,
-                                      tid.second, i, remote_graph_executor_name,
+        attr_str += BuildNodeTypeAttr(GRAPH_INPUT, tid.second, i,
+                                      remote_graph_executor_name,
                                       remote_fused_graph_node_name);
       }
     }
     for (size_t i = 0; i < outputs.size(); ++i) {
       if (IsSameNodeName(node_def, outputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::GRAPH_OUTPUT,
-                                      tid.second, i);
+        attr_str += BuildNodeTypeAttr(GRAPH_OUTPUT, tid.second, i);
       }
     }
     for (const string& fused_node_name : fused_node_names) {
       if (fused_node_name == node_def.name()) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::FUSED_NODE);
+        attr_str += BuildNodeTypeAttr(FUSED_NODE);
       }
     }
     for (const string& fused_node_name : fused_nodes_filtered_by_op_types) {
       if (fused_node_name == node_def.name()) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::FUSED_NODE);
+        attr_str += BuildNodeTypeAttr(FUSED_NODE);
       }
     }
     for (size_t i = 0; i < border_inputs.size(); ++i) {
       if (IsSameNodeName(node_def, border_inputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::BORDER_INPUT,
-                                      tid.second, i);
+        attr_str += BuildNodeTypeAttr(BORDER_INPUT, tid.second, i);
       }
     }
     for (size_t i = 0; i < border_outputs.size(); ++i) {
       if (IsSameNodeName(node_def, border_outputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
-        attr_str += BuildNodeTypeAttr(
-            RemoteFusedGraphExecuteInfo::BORDER_OUTPUT, tid.second, i);
+        attr_str += BuildNodeTypeAttr(BORDER_OUTPUT, tid.second, i);
       }
     }
     if (attr_str.empty()) {
-      attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::UNUSED);
+      attr_str += BuildNodeTypeAttr(UNUSED);
     }
     AddNodeAttr(ATTR_NODE_TYPE, attr_str, &node_def);
   }
@@ -1200,14 +1199,14 @@ RemoteFusedGraphExecuteUtils::FuseRemoteGraphByPlacedArguments(
       }
       int node_type_int;
       CHECK(strings::safe_strto32(attr.at(0), &node_type_int)) << attr.at(0);
-      const RemoteFusedGraphExecuteInfo::NodeType node_type =
-          static_cast<RemoteFusedGraphExecuteInfo::NodeType>(node_type_int);
+      const RemoteFusedGraphNodeType node_type =
+          static_cast<RemoteFusedGraphNodeType>(node_type_int);
       const string& name = node_def.name();
       int port;
       int index;
 
       switch (node_type) {
-        case RemoteFusedGraphExecuteInfo::GRAPH_INPUT:
+        case GRAPH_INPUT:
           VLOG(2) << "Graph input: " << name;
           CHECK_EQ(5, attr.size());
           CHECK(strings::safe_strto32(attr.at(1), &port));
@@ -1224,33 +1223,33 @@ RemoteFusedGraphExecuteUtils::FuseRemoteGraphByPlacedArguments(
             return Status::OK();
           }
           break;
-        case RemoteFusedGraphExecuteInfo::GRAPH_OUTPUT:
+        case GRAPH_OUTPUT:
           VLOG(2) << "Graph output: " << name;
           CHECK_EQ(3, attr.size());
           CHECK(strings::safe_strto32(attr.at(1), &port));
           CHECK(strings::safe_strto32(attr.at(2), &index));
           output_map.emplace(index, strings::StrCat(name, ":", port));
           break;
-        case RemoteFusedGraphExecuteInfo::FUSED_NODE:
+        case FUSED_NODE:
           VLOG(2) << "Fused node: " << name;
           CHECK_EQ(1, attr.size());
           fused_node_names.emplace(name);
           break;
-        case RemoteFusedGraphExecuteInfo::BORDER_INPUT:
+        case BORDER_INPUT:
           VLOG(2) << "Border input: " << name;
           CHECK_EQ(3, attr.size());
           CHECK(strings::safe_strto32(attr.at(1), &port));
           CHECK(strings::safe_strto32(attr.at(2), &index));
           border_input_map.emplace(index, strings::StrCat(name, ":", port));
           break;
-        case RemoteFusedGraphExecuteInfo::BORDER_OUTPUT:
+        case BORDER_OUTPUT:
           VLOG(2) << "Border output: " << name;
           CHECK_EQ(3, attr.size());
           CHECK(strings::safe_strto32(attr.at(1), &port));
           CHECK(strings::safe_strto32(attr.at(2), &index));
           border_output_map.emplace(index, strings::StrCat(name, ":", port));
           break;
-        case RemoteFusedGraphExecuteInfo::UNUSED:
+        case UNUSED:
           // do nothing
           break;
         default:
@@ -1461,20 +1460,19 @@ RemoteFusedGraphExecuteUtils::BuildNodeMapFromOpsDefinitions(
 }
 
 /* static */ string RemoteFusedGraphExecuteUtils::BuildNodeTypeAttr(
-    const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port,
-    const int index, const string& executor_name, const string& node_name) {
+    const RemoteFusedGraphNodeType node_type, const int port, const int index,
+    const string& executor_name, const string& node_name) {
   return strings::StrCat(static_cast<int>(node_type), ",", port, ",", index,
                          ",", executor_name, ",", node_name);
 }
 
 /* static */ string RemoteFusedGraphExecuteUtils::BuildNodeTypeAttr(
-    const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port,
-    const int index) {
+    const RemoteFusedGraphNodeType node_type, const int port, const int index) {
   return strings::StrCat(static_cast<int>(node_type), ",", port, ",", index);
 }
 
 /* static */ string RemoteFusedGraphExecuteUtils::BuildNodeTypeAttr(
-    const RemoteFusedGraphExecuteInfo::NodeType node_type) {
+    const RemoteFusedGraphNodeType node_type) {
   return strings::StrCat(static_cast<int>(node_type));
 }
 
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
index f047144278..ea6b6a1015 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
@@ -19,8 +19,6 @@ limitations under the License.
 #include <unordered_map>
 #include <unordered_set>
 
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/kernels/i_remote_fused_graph_executor.h"
@@ -30,6 +28,17 @@ limitations under the License.
 
 namespace tensorflow {
 
+enum RemoteFusedGraphNodeType {
+  UNUSED = 0,
+  GRAPH_INPUT = 1,
+  GRAPH_OUTPUT = 2,
+  FUSED_NODE = 3,
+  BORDER_INPUT = 4,
+  BORDER_OUTPUT = 5,
+};
+
+class RemoteFusedGraphExecuteInfo;
+
 // RemoteFusedGraphExecuteUtils provides APIs to register and get builder
 // functions for IRemoteFusedGraphExecutor.
 class RemoteFusedGraphExecuteUtils {
@@ -297,16 +306,15 @@ class RemoteFusedGraphExecuteUtils {
 
   static ExecutorBuildRegistry* GetExecutorBuildRegistry();
 
-  static string BuildNodeTypeAttr(
-      const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port,
-      const int index, const string& executor_name, const string& node_name);
+  static string BuildNodeTypeAttr(const RemoteFusedGraphNodeType node_type,
+                                  const int port, const int index,
+                                  const string& executor_name,
+                                  const string& node_name);
 
-  static string BuildNodeTypeAttr(
-      const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port,
-      const int index);
+  static string BuildNodeTypeAttr(const RemoteFusedGraphNodeType node_type,
+                                  const int port, const int index);
 
-  static string BuildNodeTypeAttr(
-      const RemoteFusedGraphExecuteInfo::NodeType node_type);
+  static string BuildNodeTypeAttr(const RemoteFusedGraphNodeType node_type);
 
   TF_DISALLOW_COPY_AND_ASSIGN(RemoteFusedGraphExecuteUtils);
 };
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
index aca8ddfae9..44251e6ff8 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
index 9217c25978..1e0731e540 100644
--- a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/nn_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/graph/default_device.h"
diff --git a/tensorflow/core/kernels/summary_interface.h b/tensorflow/core/kernels/summary_interface.h
index 02391e967a..1854fe5526 100644
--- a/tensorflow/core/kernels/summary_interface.h
+++ b/tensorflow/core/kernels/summary_interface.h
@@ -17,14 +17,15 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/event.pb.h"
 
 namespace tensorflow {
 
+class Event;
+class GraphDef;
+
 // Main interface for the summary writer resource.
 class SummaryWriterInterface : public ResourceBase {
  public:
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index d317a8d33d..b287f0cc2f 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/event.pb.h"
 
 namespace tensorflow {
 
-- 
GitLab


From 21bd19a8b8b0be8ac4d39b6bc32366ba908f5105 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:49:13 +0000
Subject: [PATCH 283/450] Change from squeeze_dims to axis when calling
 tf.squeeze

The `squeeze_dims` argument of `tf.squeeze` has been deprecated in
favor of `axis`, but many call sites still pass `squeeze_dims`, which
generates lots of deprecation warnings.

This fix switches from `squeeze_dims` to `axis` to remove those warnings.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/array_grad.py | 2 +-
 tensorflow/python/ops/array_ops.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 57d2657838..3678bd4c1f 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -196,7 +196,7 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
             array_ops.where(
                 math_ops.logical_and(grad.indices >= start,
                                      grad.indices < end)),
-            squeeze_dims=[1])
+            axis=[1])
         new_indices = array_ops.gather(grad.indices, indices_to_select) - start
         new_values = array_ops.gather(grad.values, indices_to_select)
         out_grads.append(ops.IndexedSlices(new_values, new_indices, size))
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 23202ae28e..bbffff0483 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1230,7 +1230,7 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
 
   def _apply_mask_1d(reshaped_tensor, mask, axis=None):
     """Mask tensor along dimension 0 with a 1-D mask."""
-    indices = squeeze(where(mask), squeeze_dims=[1])
+    indices = squeeze(where(mask), axis=[1])
     return gather(reshaped_tensor, indices, axis=axis)
 
   with ops.name_scope(name, values=[tensor, mask]):
-- 
GitLab


From 100b6000d4d04a344a1516578f724e46cdede5e1 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:52:31 +0000
Subject: [PATCH 284/450] Fix warning in image related ops.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/image_ops_impl.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 601010bce9..bd5b2ae83b 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -652,7 +652,7 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
     padded.set_shape(padded_shape)
 
     if not is_batch:
-      padded = array_ops.squeeze(padded, squeeze_dims=[0])
+      padded = array_ops.squeeze(padded, axis=[0])
 
     return padded
 
@@ -732,7 +732,7 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
     cropped.set_shape(cropped_shape)
 
     if not is_batch:
-      cropped = array_ops.squeeze(cropped, squeeze_dims=[0])
+      cropped = array_ops.squeeze(cropped, axis=[0])
 
     return cropped
 
@@ -849,7 +849,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
     resized = control_flow_ops.with_dependencies(assert_ops, resized)
 
     if not is_batch:
-      resized = array_ops.squeeze(resized, squeeze_dims=[0])
+      resized = array_ops.squeeze(resized, axis=[0])
 
     return resized
 
@@ -942,7 +942,7 @@ def resize_images(images,
            for x in [new_width_const, width, new_height_const, height]) and (
                width == new_width_const and height == new_height_const):
       if not is_batch:
-        images = array_ops.squeeze(images, squeeze_dims=[0])
+        images = array_ops.squeeze(images, axis=[0])
       return images
 
     if method == ResizeMethod.BILINEAR:
@@ -965,7 +965,7 @@ def resize_images(images,
     images.set_shape([None, new_height_const, new_width_const, None])
 
     if not is_batch:
-      images = array_ops.squeeze(images, squeeze_dims=[0])
+      images = array_ops.squeeze(images, axis=[0])
     return images
 
 
-- 
GitLab


From 8cdc752227af998da946decc9365d63bcaa7f184 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:53:10 +0000
Subject: [PATCH 285/450] Fix warning in tf.nn ops where squeeze_dims was used
 with tf.squeeze

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/nn_impl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index d0d5ed07ce..576627e78e 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -765,9 +765,9 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     weighted_variance = math_ops.multiply(weighted_distsq, divisor)
 
     if not keep_dims:
-      weighted_mean = array_ops.squeeze(weighted_mean, squeeze_dims=axes)
+      weighted_mean = array_ops.squeeze(weighted_mean, axis=axes)
       weighted_variance = array_ops.squeeze(
-          weighted_variance, squeeze_dims=axes)
+          weighted_variance, axis=axes)
 
     if needs_cast:
       weighted_mean = math_ops.cast(weighted_mean, dtypes.float16)
-- 
GitLab


From 12fd64f72f59ff5ba114903d4b851f855aaf2458 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:53:58 +0000
Subject: [PATCH 286/450] Fix warnings in reduce_join_op_test.py

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/reduce_join_op_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/reduce_join_op_test.py b/tensorflow/python/kernel_tests/reduce_join_op_test.py
index 7f3049b9f8..fb9e5cc2a3 100644
--- a/tensorflow/python/kernel_tests/reduce_join_op_test.py
+++ b/tensorflow/python/kernel_tests/reduce_join_op_test.py
@@ -160,7 +160,7 @@ class ReduceJoinTest(UnicodeTestCase):
             separator=separator)
       if not reduction_indices:
         truth = constant_op.constant(truth)
-      truth_squeezed = array_ops.squeeze(truth, squeeze_dims=reduction_indices)
+      truth_squeezed = array_ops.squeeze(truth, axis=reduction_indices)
       output_array = output.eval()
       output_keep_dims_array = output_keep_dims.eval()
       truth_array = truth.eval()
-- 
GitLab


From 9aa142284166c51dfc202b551b4592f9c9ed54e7 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:54:26 +0000
Subject: [PATCH 287/450] Fix tf.contrib.timeseries warnings related to
 squeeze_dims

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../timeseries/python/timeseries/state_management_test.py   | 2 +-
 .../python/timeseries/state_space_models/kalman_filter.py   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py
index d5dce30fda..5f7e3da2db 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py
@@ -78,7 +78,7 @@ class StubTimeSeriesModel(model.TimeSeriesModel):
     batch_end_values = array_ops.squeeze(
         array_ops.slice(values, [0, array_ops.shape(times)[1] - 1, 0],
                         [-1, 1, -1]),
-        squeeze_dims=[1, 2])
+        axis=[1, 2])
     # A pretty odd but easy to think about loss: L1 loss on the batch end
     # values.
     loss = math_ops.reduce_sum(
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
index 1fcd3e391b..a614386121 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
@@ -170,7 +170,7 @@ class KalmanFilter(object):
         math_ops.matmul(
             transition_matrices,
             prior_state[..., None]),
-        squeeze_dims=[-1])
+        axis=[-1])
     return advanced_state
 
   def predict_state_var(
@@ -254,7 +254,7 @@ class KalmanFilter(object):
             kalman_gain_transposed,
             array_ops.expand_dims(residual, -1),
             adjoint_a=True),
-        squeeze_dims=[-1])
+        axis=[-1])
     gain_obs = math_ops.matmul(
         kalman_gain_transposed, observation_model, adjoint_a=True)
     identity_extradim = linalg_ops.eye(
@@ -332,7 +332,7 @@ class KalmanFilter(object):
             array_ops.expand_dims(state_mean, 1),
             observation_model,
             adjoint_b=True),
-        squeeze_dims=[1])
+        axis=[1])
     observed_var = math_ops.matmul(
         math_ops.matmul(observation_model, state_var),
         observation_model,
-- 
GitLab


From 8257b9096062a87555d72f7c15e16b1d8e748d70 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:55:06 +0000
Subject: [PATCH 288/450] Fix warnings in tf.contrib.tensor_forest

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/tensor_forest/client/eval_metrics.py       | 4 ++--
 .../tensor_forest/hybrid/python/layers/fully_connected.py     | 2 +-
 tensorflow/contrib/tensor_forest/python/tensor_forest.py      | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
index 90033015eb..e893e1d1c8 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
@@ -37,7 +37,7 @@ def _top_k_generator(k):
   def _top_k(probabilities, targets):
     targets = math_ops.to_int32(targets)
     if targets.get_shape().ndims > 1:
-      targets = array_ops.squeeze(targets, squeeze_dims=[1])
+      targets = array_ops.squeeze(targets, axis=[1])
     return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k))
   return _top_k
 
@@ -57,7 +57,7 @@ def _r2(probabilities, targets, weights=None):
 
 
 def _squeeze_and_onehot(targets, depth):
-  targets = array_ops.squeeze(targets, squeeze_dims=[1])
+  targets = array_ops.squeeze(targets, axis=[1])
   return array_ops.one_hot(math_ops.to_int32(targets), depth)
 
 
diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py
index ff3ab21eaa..745a5b1caf 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py
+++ b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py
@@ -55,7 +55,7 @@ class ManyToOneLayer(hybrid_layer.HybridLayer):
 
       # There is always one activation per instance by definition, so squeeze
       # away the extra dimension.
-      return array_ops.squeeze(nn_activations, squeeze_dims=[1])
+      return array_ops.squeeze(nn_activations, axis=[1])
 
 
 class FlattenedFullyConnectedLayer(hybrid_layer.HybridLayer):
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
index b9bcbb170b..7a35a70bbe 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
@@ -445,7 +445,7 @@ class RandomForestGraphs(object):
           mask = math_ops.less(
               r, array_ops.ones_like(r) * self.params.bagging_fraction)
           gather_indices = array_ops.squeeze(
-              array_ops.where(mask), squeeze_dims=[1])
+              array_ops.where(mask), axis=[1])
           # TODO(thomaswc): Calculate out-of-bag data and labels, and store
           # them for use in calculating statistics later.
           tree_data = array_ops.gather(processed_dense_features, gather_indices)
-- 
GitLab


From 685fec394235b409b58d7ef1c4a26655f9fedcfd Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:55:35 +0000
Subject: [PATCH 289/450] Fix squeeze_dims warnings in tf.contrib.learn

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/learn/python/learn/estimators/head.py | 4 ++--
 tensorflow/contrib/learn/python/learn/ops/losses_ops.py  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 2b4b6eff39..e28e6854a5 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -777,7 +777,7 @@ class _RegressionHead(_SingleHead):
     key = prediction_key.PredictionKey.SCORES
     with ops.name_scope(None, "predictions", (logits,)):
       if self.logits_dimension == 1:
-        logits = array_ops.squeeze(logits, squeeze_dims=(1,), name=key)
+        logits = array_ops.squeeze(logits, axis=(1,), name=key)
       return {key: self._link_fn(logits)}
 
   def _metrics(self, eval_loss, predictions, labels, weights):
@@ -974,7 +974,7 @@ def _softmax_cross_entropy_loss(labels, logits, weights=None):
     is_squeezed_labels = False
     # TODO(ptucker): This will break for dynamic shapes.
     if len(labels.get_shape()) == 2:
-      labels = array_ops.squeeze(labels, squeeze_dims=(1,))
+      labels = array_ops.squeeze(labels, axis=(1,))
       is_squeezed_labels = True
 
     loss = nn.sparse_softmax_cross_entropy_with_logits(
diff --git a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
index 92976d1539..9f2cadb017 100644
--- a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
+++ b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
@@ -40,7 +40,7 @@ def mean_squared_error_regressor(tensor_in, labels, weights, biases, name=None):
                       [tensor_in, labels]):
     predictions = nn.xw_plus_b(tensor_in, weights, biases)
     if len(labels.get_shape()) == 1 and len(predictions.get_shape()) == 2:
-      predictions = array_ops_.squeeze(predictions, squeeze_dims=[1])
+      predictions = array_ops_.squeeze(predictions, axis=[1])
     return predictions, losses.mean_squared_error(labels, predictions)
 
 
-- 
GitLab


From 5c19fc7810f13712127b8527b040f8f656474fe5 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:56:09 +0000
Subject: [PATCH 290/450] Fix tf.contrib.layers warnings where squeeze_dims
 were used with tf.squeeze

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/layers/python/layers/target_column.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/target_column.py b/tensorflow/contrib/layers/python/layers/target_column.py
index 3e639a180e..69bb6be814 100644
--- a/tensorflow/contrib/layers/python/layers/target_column.py
+++ b/tensorflow/contrib/layers/python/layers/target_column.py
@@ -270,7 +270,7 @@ class _RegressionTargetColumn(_TargetColumn):
 
   def logits_to_predictions(self, logits, proba=False):
     if self.num_label_columns == 1:
-      return array_ops.squeeze(logits, squeeze_dims=[1])
+      return array_ops.squeeze(logits, axis=[1])
     return logits
 
   def get_eval_ops(self, features, logits, labels, metrics=None):
@@ -418,7 +418,7 @@ def _softmax_cross_entropy_loss(logits, target):
                      "Instead got %s." % target.dtype)
   # sparse_softmax_cross_entropy_with_logits requires [batch_size] target.
   if len(target.get_shape()) == 2:
-    target = array_ops.squeeze(target, squeeze_dims=[1])
+    target = array_ops.squeeze(target, axis=[1])
   loss_vec = nn.sparse_softmax_cross_entropy_with_logits(
       labels=target, logits=logits)
   return loss_vec
-- 
GitLab


From 50a8df144d24ce60866bff96645f04e84a31f8b4 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:57:06 +0000
Subject: [PATCH 291/450] Fix warnings in tf.contrib.factorization

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/factorization/python/ops/gmm_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index ccdd679d6a..e076631bc1 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -397,7 +397,7 @@ class GmmAlgorithm(object):
     # Compute the effective number of data points assigned to component k.
     with ops.control_dependencies(self._w):
       points_in_k = array_ops.squeeze(
-          math_ops.add_n(self._points_in_k), squeeze_dims=[0])
+          math_ops.add_n(self._points_in_k), axis=[0])
       # Update alpha.
       if 'w' in self._params:
         final_points_in_k = points_in_k / num_batches
-- 
GitLab


From 82eacbd4ac29db754b86a0be0cdfcc65b467c6af Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:57:31 +0000
Subject: [PATCH 292/450] Fix warnings in tf.contrib.distributions with
 squeeze_dims

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/ops/bijectors/cholesky_outer_product.py              | 2 +-
 tensorflow/contrib/distributions/python/ops/shape.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
index caae2adcfa..ecdb8967f4 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
@@ -170,7 +170,7 @@ class CholeskyOuterProduct(bijector.Bijector):
     sum_weighted_log_diag = array_ops.squeeze(
         math_ops.matmul(math_ops.log(diag),
                         exponents[..., array_ops.newaxis]),
-        squeeze_dims=-1)
+        axis=-1)
     fldj = p_float * np.log(2.) + sum_weighted_log_diag
 
     return fldj
diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py
index bac0b79d59..6a7f28713a 100644
--- a/tensorflow/contrib/distributions/python/ops/shape.py
+++ b/tensorflow/contrib/distributions/python/ops/shape.py
@@ -439,7 +439,7 @@ class _DistributionShape(object):
           if self._batch_ndims_is_0 and expand_batch_dim:
             squeeze_dims += [1]
           if squeeze_dims:
-            x = array_ops.squeeze(x, squeeze_dims=squeeze_dims)
+            x = array_ops.squeeze(x, axis=squeeze_dims)
             # x.shape: [prod(S)]+B+E
         _, batch_shape, event_shape = self.get_shape(x)
       else:
-- 
GitLab


From ea0c8a7ed84eb5cdf8ca6a856f9bd05a95597739 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Sun, 22 Apr 2018 12:18:05 -0700
Subject: [PATCH 293/450] [StreamExecutor] [XLA] Delete copy/pasted
 implementations of MakeUnique.

StreamExecutor and XLA had a copy/pasted implementation of MakeUnique,
in namespaces stream_executor::port and xla.

This change removes those implementations and instead pulls
tensorflow::MakeUnique into namespace stream_executor and namespace xla.
We pull it into stream_executor rather than stream_executor::port for
consistency with TF and XLA, which both pull MakeUnique into their own
namespace.

This change also moves MakeUnique and WrapUnique out of namespace
tensorflow::scam_ops::internal -- scam can simply use
tensorflow::{Make,Wrap}Unique.  I suspect the reason it was this way
originally was that TF didn't have Make/WrapUnique.
PiperOrigin-RevId: 193849330
---
 tensorflow/compiler/xla/ptr_util.h            | 22 +---------
 .../xla/service/interpreter/platform.cc       |  4 +-
 tensorflow/stream_executor/BUILD              |  2 +
 .../stream_executor/cuda/cuda_platform.cc     |  4 +-
 .../stream_executor/host/host_platform.cc     |  4 +-
 tensorflow/stream_executor/lib/ptr_util.h     | 42 ++-----------------
 6 files changed, 13 insertions(+), 65 deletions(-)

diff --git a/tensorflow/compiler/xla/ptr_util.h b/tensorflow/compiler/xla/ptr_util.h
index c58c19db2c..bfcdfc62f9 100644
--- a/tensorflow/compiler/xla/ptr_util.h
+++ b/tensorflow/compiler/xla/ptr_util.h
@@ -28,26 +28,8 @@ limitations under the License.
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace xla {
-
-template <typename T>
-std::unique_ptr<T> WrapUnique(T* ptr) {
-  return tensorflow::WrapUnique<T>(ptr);
-}
-
-template <typename T, typename... Args>
-typename tensorflow::helper::MakeUniqueResult<T>::scalar MakeUnique(
-    Args&&... args) {
-  return tensorflow::MakeUnique<T, Args...>(std::forward<Args>(args)...);
-}
-
-// Overload for array of unknown bound.
-// The allocation of arrays needs to use the array form of new,
-// and cannot take element constructor arguments.
-template <typename T>
-typename tensorflow::helper::MakeUniqueResult<T>::array MakeUnique(size_t n) {
-  return tensorflow::MakeUnique<T>(n);
-}
-
+using tensorflow::MakeUnique;
+using tensorflow::WrapUnique;
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc
index ce2f4d378c..92e069a8c6 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform.cc
@@ -71,8 +71,8 @@ port::StatusOr<StreamExecutor*> XlaInterpreterPlatform::GetExecutor(
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 XlaInterpreterPlatform::GetUncachedExecutor(
     const StreamExecutorConfig& config) {
-  auto executor = port::MakeUnique<StreamExecutor>(
-      this, port::MakeUnique<XlaInterpreterExecutor>(config.plugin_config));
+  auto executor = MakeUnique<StreamExecutor>(
+      this, MakeUnique<XlaInterpreterExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status{
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 80fc9ff292..c68cda0100 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -35,6 +35,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
+        "//tensorflow/core:ptr_util",
         "@local_config_cuda//cuda:cuda_headers",
     ],
     alwayslink = 1,
@@ -46,6 +47,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:lib",
+        "//tensorflow/core:ptr_util",
         "//tensorflow/compiler/xla:statusor",
         "@local_config_cuda//cuda:cuda_headers",
     ] + if_static([":stream_executor_impl"]),
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index 7a6ef5a248..649224a20e 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -168,8 +168,8 @@ port::StatusOr<StreamExecutor*> CudaPlatform::GetExecutor(
 
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
-  auto executor = port::MakeUnique<StreamExecutor>(
-      this, port::MakeUnique<CUDAExecutor>(config.plugin_config));
+  auto executor = MakeUnique<StreamExecutor>(
+      this, MakeUnique<CUDAExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status{
diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc
index 00a17a05ed..a652b08b4f 100644
--- a/tensorflow/stream_executor/host/host_platform.cc
+++ b/tensorflow/stream_executor/host/host_platform.cc
@@ -66,8 +66,8 @@ port::StatusOr<StreamExecutor*> HostPlatform::GetExecutor(
 
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 HostPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
-  auto executor = port::MakeUnique<StreamExecutor>(
-      this, port::MakeUnique<HostExecutor>(config.plugin_config));
+  auto executor = MakeUnique<StreamExecutor>(
+      this, MakeUnique<HostExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status{
diff --git a/tensorflow/stream_executor/lib/ptr_util.h b/tensorflow/stream_executor/lib/ptr_util.h
index 3f89794688..8f9f420fec 100644
--- a/tensorflow/stream_executor/lib/ptr_util.h
+++ b/tensorflow/stream_executor/lib/ptr_util.h
@@ -17,47 +17,11 @@ limitations under the License.
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_
 
 #include <memory>
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace stream_executor {
-namespace port {
-
-// Trait to select overloads and return types for MakeUnique.
-template <typename T>
-struct MakeUniqueResult {
-  using scalar = std::unique_ptr<T>;
-};
-template <typename T>
-struct MakeUniqueResult<T[]> {
-  using array = std::unique_ptr<T[]>;
-};
-template <typename T, size_t N>
-struct MakeUniqueResult<T[N]> {
-  using invalid = void;
-};
-
-// MakeUnique<T>(...) is an early implementation of C++14 std::make_unique.
-// It is designed to be 100% compatible with std::make_unique so that the
-// eventual switchover will be a simple renaming operation.
-template <typename T, typename... Args>
-typename MakeUniqueResult<T>::scalar MakeUnique(Args&&... args) {  // NOLINT
-  return std::unique_ptr<T>(
-      new T(std::forward<Args>(args)...));  // NOLINT(build/c++11)
-}
-
-// Overload for array of unknown bound.
-// The allocation of arrays needs to use the array form of new,
-// and cannot take element constructor arguments.
-template <typename T>
-typename MakeUniqueResult<T>::array MakeUnique(size_t n) {
-  return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]());
-}
-
-// Reject arrays of known bound.
-template <typename T, typename... Args>
-typename MakeUniqueResult<T>::invalid MakeUnique(Args&&... /* args */) =
-    delete;  // NOLINT
-
-}  // namespace port
+using tensorflow::MakeUnique;
+using tensorflow::WrapUnique;
 }  // namespace stream_executor
 
 namespace perftools {
-- 
GitLab


From 56fd856425f1322d22796decb1f0580c8fab5d5a Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Sun, 22 Apr 2018 14:48:05 -0700
Subject: [PATCH 294/450] [XLA] Make Executable return a ScopedShapedBuffer.

Previously, we returned a plain ShapedBuffer.  But this doesn't capture
our semantics: It's up to the caller to free this ShapedBuffer.

PiperOrigin-RevId: 193854051
---
 .../compiler/xla/client/local_client.cc       | 12 ++---
 .../xla/service/allocation_tracker.cc         | 45 ++++++++++++-------
 .../compiler/xla/service/allocation_tracker.h | 32 ++++++++-----
 .../xla/service/cpu/cpu_executable.cc         | 14 +++---
 .../compiler/xla/service/cpu/cpu_executable.h |  8 ++--
 .../service/cpu/parallel_cpu_executable.cc    | 10 ++---
 .../xla/service/cpu/parallel_cpu_executable.h |  4 +-
 tensorflow/compiler/xla/service/executable.cc |  8 ++--
 tensorflow/compiler/xla/service/executable.h  |  8 ++--
 .../xla/service/gpu/gpu_executable.cc         |  8 ++--
 .../compiler/xla/service/gpu/gpu_executable.h |  4 +-
 tensorflow/compiler/xla/service/hlo_runner.cc | 14 ++----
 .../xla/service/interpreter/executable.cc     |  8 ++--
 .../xla/service/interpreter/executable.h      |  4 +-
 tensorflow/compiler/xla/service/service.cc    | 14 +++---
 .../compiler/xla/service/shaped_buffer.cc     |  4 +-
 .../compiler/xla/service/shaped_buffer.h      |  6 +++
 .../compiler/xla/service/transfer_manager.cc  | 15 ++-----
 .../compiler/xla/service/transfer_manager.h   |  5 +--
 tensorflow/compiler/xla/tests/fusion_test.cc  |  6 +--
 20 files changed, 119 insertions(+), 110 deletions(-)

diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index d0e945b70f..1c12705903 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -166,12 +166,8 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
   if (executable_->dumping()) {
     return ExecuteAndDump(&service_options, arguments);
   }
-  TF_ASSIGN_OR_RETURN(
-      ShapedBuffer result,
-      executable_->ExecuteOnStreamWrapper(
-          &service_options, run_options.execution_profile(), arguments));
-
-  return ScopedShapedBuffer(std::move(result), run_options.allocator());
+  return executable_->ExecuteOnStreamWrapper(
+      &service_options, run_options.execution_profile(), arguments);
 }
 
 StatusOr<ScopedShapedBuffer> LocalExecutable::ExecuteAndDump(
@@ -181,12 +177,12 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::ExecuteAndDump(
       backend_->platform()->Name());
   TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->session_module()));
   TF_ASSIGN_OR_RETURN(
-      ShapedBuffer result,
+      ScopedShapedBuffer result,
       executable_->ExecuteOnStream(run_options, arguments,
                                    /*hlo_execution_profile=*/nullptr));
   TF_RETURN_IF_ERROR(RecordResult(&result, executable_->session_module()));
   TF_RETURN_IF_ERROR(executable_->DumpSessionModule());
-  return ScopedShapedBuffer(std::move(result), run_options->allocator());
+  return std::move(result);
 }
 
 tensorflow::Status LocalExecutable::RecordArguments(
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 6bf65825cd..cf1231bcce 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -31,23 +31,35 @@ limitations under the License.
 namespace xla {
 
 StatusOr<GlobalDataHandle> AllocationTracker::Register(
-    ShapedBuffer shaped_buffer, const string& tag) {
+    ScopedShapedBuffer shaped_buffer, const string& tag) {
   tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "Register";
-  std::vector<ShapedBuffer> replicated_buffers;
+  std::vector<ScopedShapedBuffer> replicated_buffers;
   replicated_buffers.emplace_back(std::move(shaped_buffer));
   return RegisterInternal(std::move(replicated_buffers), tag);
 }
 
 StatusOr<GlobalDataHandle> AllocationTracker::RegisterReplicatedBuffers(
-    std::vector<ShapedBuffer> replicated_buffers, const string& tag) {
+    std::vector<ScopedShapedBuffer> replicated_buffers, const string& tag) {
   tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "RegisterReplicatedBuffers";
   return RegisterInternal(std::move(replicated_buffers), tag);
 }
 
+// ReleaseIfScopedShapedBuffer lets RegisterInternal<ShapedBufferTy>(b) call
+// b.release() if b is a ScopedShapedBuffer, or otherwise pass b through
+// unmodified.
+static ShapedBuffer ReleaseIfScopedShapedBuffer(ShapedBuffer b) { return b; }
+static ShapedBuffer ReleaseIfScopedShapedBuffer(ScopedShapedBuffer b) {
+  return b.release();
+}
+
+template <typename ShapedBufferTy>
 StatusOr<GlobalDataHandle> AllocationTracker::RegisterInternal(
-    std::vector<ShapedBuffer> replicated_buffers, const string& tag) {
+    std::vector<ShapedBufferTy> replicated_buffers, const string& tag) {
+  static_assert(std::is_same<ShapedBufferTy, ShapedBuffer>::value ||
+                    std::is_same<ShapedBufferTy, ScopedShapedBuffer>::value,
+                "ShapedBufferTy must be ShapedBuffer or ScopedShapedBuffer.");
   VLOG(2) << "RegisterInternal("
           << "tag: \"" << tag << "\" with " << replicated_buffers.size()
           << " shaped_buffers.";
@@ -65,17 +77,22 @@ StatusOr<GlobalDataHandle> AllocationTracker::RegisterInternal(
   int64 handle = next_handle_++;
   for (auto& shaped_buffer : replicated_buffers) {
     std::vector<ShapeIndex> shape_indices;
-    ShapeUtil::ForEachSubshape(shaped_buffer.on_device_shape(),
-                               [this, &shape_indices](const Shape& /*subshape*/,
-                                                      const ShapeIndex& index) {
-                                 shape_indices.push_back(index);
-                               });
+    ShapeUtil::ForEachSubshape(
+        shaped_buffer.on_device_shape(),
+        [&](const Shape& /*subshape*/, const ShapeIndex& index) {
+          shape_indices.push_back(index);
+        });
+    // Add shaped_buffer's buffers to opaque_to_allocation_map_, which owns
+    // them.
     for (const ShapeIndex& index : shape_indices) {
       AddAllocationOrIncrementRefCount(shaped_buffer.buffer(index),
                                        shaped_buffer.device_ordinal());
     }
-    handle_to_shaped_buffers_[handle].emplace_back(
-        MakeUnique<ShapedBuffer>(std::move(shaped_buffer)));
+    // If ShapedBufferTy is ScopedShapedBuffer, release the ScopedShapedBuffer
+    // into a regular ShapedBuffer, which is stored in
+    // handle_to_shaped_buffers_.
+    handle_to_shaped_buffers_[handle].emplace_back(MakeUnique<ShapedBuffer>(
+        ReleaseIfScopedShapedBuffer(std::move(shaped_buffer))));
   }
 
   GlobalDataHandle result;
@@ -102,10 +119,6 @@ tensorflow::Status AllocationTracker::Unregister(const GlobalDataHandle& data) {
                                            shaped_buffer->device_ordinal()));
     }
   }
-  return Reset(data);
-}
-
-Status AllocationTracker::Reset(const GlobalDataHandle& data) {
   // Keep a nullptr as a tombstone for unregistered handles. This enables
   // better error messages. That is, "handle has been deallocated" versus
   // "handle does not exist".
@@ -152,7 +165,7 @@ StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
     element_buffer.set_buffer(shaped_buffer->buffer(/*index=*/{i}),
                               /*index=*/{});
     std::vector<ShapedBuffer> replicated_buffers;
-    replicated_buffers.emplace_back(std::move(element_buffer));
+    replicated_buffers.push_back(std::move(element_buffer));
     TF_ASSIGN_OR_RETURN(
         GlobalDataHandle element_handle,
         RegisterInternal(std::move(replicated_buffers), "deconstructed tuple"));
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index 2bfcd53712..1174fa641c 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -45,13 +45,13 @@ class AllocationTracker {
   // Registers a shaped buffer of device memory, and returns a corresponding
   // handle that can be used for talking to XLA clients. The given shaped buffer
   // will be treated as the buffer corresponding to the only replica.
-  StatusOr<GlobalDataHandle> Register(ShapedBuffer shaped_buffer,
+  StatusOr<GlobalDataHandle> Register(ScopedShapedBuffer shaped_buffer,
                                       const string& tag);
 
   // Registers a vector of shaped buffers of device memory, one per replica, and
   // returns a corresponding handle that can be used for talking to XLA clients.
   StatusOr<GlobalDataHandle> RegisterReplicatedBuffers(
-      std::vector<ShapedBuffer> replicated_buffers, const string& tag);
+      std::vector<ScopedShapedBuffer> replicated_buffers, const string& tag);
 
   // Unregister the allocation for the given data handle.
   Status Unregister(const GlobalDataHandle& data);
@@ -87,21 +87,21 @@ class AllocationTracker {
   };
 
   // Internal helper which resolves the given GlobalDataHandle to a
-  // ShapedBuffer.
+  // list of ScopedShapedBuffers.
   StatusOr<std::vector<const ShapedBuffer*>> ResolveInternal(
       const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Internal helper which registers a vector of shaped buffers, one per
-  // replica.
+  // replica.  ShapedBufferTy is either ScopedShapedBuffer or ShapedBuffer.  If
+  // it's ShapedBuffer, all of the given buffers must already be tracked by this
+  // object -- presumably this is a call from DeconstructTuple.
+  template <typename ShapedBufferTy>
   StatusOr<GlobalDataHandle> RegisterInternal(
-      std::vector<ShapedBuffer> replicated_buffers, const string& tag)
+      std::vector<ShapedBufferTy> replicated_buffers, const string& tag)
       EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
-  // Resets the shaped buffers corresponding to the given handle.
-  Status Reset(const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
   // Adds the given device address to the allocation tracker, or if it already
-  // exists, then increment it's reference count.
+  // exists, then increment its reference count.
   void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory,
                                         int device_ordinal)
       EXCLUSIVE_LOCKS_REQUIRED(mutex_);
@@ -133,7 +133,19 @@ class AllocationTracker {
   // buffers for different replicas.
   //
   // The ShapedBuffers in this map's vectors need to be unique_ptrs, because our
-  // public API returns pointers to them.
+  // public API returns pointers to them.  We expect the concrete class to be
+  // ShapedBuffer and never ScopedShapedBuffer; deallocation of buffers is
+  // handled by opaque_to_allocation_map_.
+  //
+  // The elements of the vectors need to be unique_ptrs because we return
+  // pointers to them.  (In theory we could use std::list or something instead,
+  // but we also want to be able to null out these elements.)
+  //
+  // The reason that the elements can't be unique_ptr<ScopedShapedBuffer>s is
+  // the existence of DeconstructTuple().  This function allows us to create a
+  // non-owning "view" into a tuple's sub-buffers.  The sub-buffers are then
+  // free'd when both the view *and* the original tuple are Unregistered.  This
+  // refcounting is managed in opaque_to_allocation_map_.
   tensorflow::gtl::FlatMap<int64, std::vector<std::unique_ptr<ShapedBuffer>>>
       handle_to_shaped_buffers_ GUARDED_BY(mutex_);
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 97e550abe4..aabf4d5161 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -243,14 +243,14 @@ static Status DeallocateTempBuffers(
   return Status::OK();
 }
 
-StatusOr<ShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
+StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
     std::vector<bool>* buffers_in_result) {
   se::Stream* stream = run_options->stream();
-  ShapedBuffer result_buffer(
+  ScopedShapedBuffer result_buffer(
       /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
-      stream->parent()->platform(), stream->parent()->device_ordinal());
+      run_options->allocator(), stream->parent()->device_ordinal());
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer which is returned to the caller.
@@ -281,7 +281,7 @@ StatusOr<ShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
   return std::move(result_buffer);
 }
 
-StatusOr<ShapedBuffer> CpuExecutable::ExecuteOnStream(
+StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -300,7 +300,7 @@ StatusOr<ShapedBuffer> CpuExecutable::ExecuteOnStream(
 
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
   TF_ASSIGN_OR_RETURN(
-      ShapedBuffer result_buffer,
+      ScopedShapedBuffer result_buffer,
       CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
 
   // Free all buffers not in the result.
@@ -310,7 +310,7 @@ StatusOr<ShapedBuffer> CpuExecutable::ExecuteOnStream(
   return std::move(result_buffer);
 }
 
-StatusOr<ShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   if (hlo_profiling_enabled()) {
@@ -330,7 +330,7 @@ StatusOr<ShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
 
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
   TF_ASSIGN_OR_RETURN(
-      ShapedBuffer result_buffer,
+      ScopedShapedBuffer result_buffer,
       CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
 
   LogLiveAddresses(buffers, buffers_in_result);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 06b6943cb5..68ad38cba8 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -55,12 +55,12 @@ class CpuExecutable : public Executable {
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~CpuExecutable() override {}
 
-  StatusOr<ShapedBuffer> ExecuteOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
@@ -102,13 +102,13 @@ class CpuExecutable : public Executable {
       tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
       HloExecutionProfile* hlo_execution_profile);
 
-  // Creates a ShapedBuffer for holding the result of the computation. The
+  // Creates a ScopedShapedBuffer for holding the result of the computation. The
   // addresses (DeviceMemoryBases) are set according to buffer assignment.
   // 'buffers_in_result' should point to a vector of the same size as
   // 'allocated_buffers'. An element in buffers_in_result is set to true if the
   // corresponding buffer is live out of the computation (and thus contained in
   // the returned ShapedBuffer).
-  StatusOr<ShapedBuffer> CreateResultShapedBuffer(
+  StatusOr<ScopedShapedBuffer> CreateResultShapedBuffer(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
       std::vector<bool>* buffers_in_result);
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index a2bd4fa195..035f9ddb2e 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -447,7 +447,7 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
   return Status::OK();
 }
 
-StatusOr<ShapedBuffer> ParallelCpuExecutable::ExecuteOnStream(
+StatusOr<ScopedShapedBuffer> ParallelCpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -459,9 +459,9 @@ StatusOr<ShapedBuffer> ParallelCpuExecutable::ExecuteOnStream(
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
   std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
 
-  ShapedBuffer result_buffer(
+  ScopedShapedBuffer result_buffer(
       /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
-      stream->parent()->platform(), stream->parent()->device_ordinal());
+      run_options->allocator(), stream->parent()->device_ordinal());
 
   TF_RETURN_IF_ERROR(AllocateBuffers(
       memory_allocator, stream->parent()->device_ordinal(), &buffers));
@@ -470,7 +470,7 @@ StatusOr<ShapedBuffer> ParallelCpuExecutable::ExecuteOnStream(
                                              hlo_execution_profile));
 
   // Copy DeviceMemoryBase values which into the respective location in
-  // ShapedBuffer which is returned to the caller.
+  // the ScopedShapedBuffer which is returned to the caller.
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
   TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
       [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
@@ -511,7 +511,7 @@ StatusOr<ShapedBuffer> ParallelCpuExecutable::ExecuteOnStream(
   return std::move(result_buffer);
 }
 
-StatusOr<ShapedBuffer> ParallelCpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ScopedShapedBuffer> ParallelCpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index 5ce84fa996..55f8331b59 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -59,12 +59,12 @@ class ParallelCpuExecutable : public Executable {
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~ParallelCpuExecutable() override {}
 
-  StatusOr<ShapedBuffer> ExecuteOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index be19b3ff04..021f09d310 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -29,12 +29,12 @@ using tensorflow::gtl::ArraySlice;
 
 namespace xla {
 
-StatusOr<std::vector<ShapedBuffer>> Executable::ExecuteOnStreams(
+StatusOr<std::vector<ScopedShapedBuffer>> Executable::ExecuteOnStreams(
     ArraySlice<const ServiceExecutableRunOptions> run_options,
     ArraySlice<ArraySlice<const ShapedBuffer*>> arguments) {
   TF_RET_CHECK(run_options.size() == arguments.size());
 
-  std::vector<ShapedBuffer> return_values;
+  std::vector<ScopedShapedBuffer> return_values;
   return_values.reserve(run_options.size());
 
   if (run_options.size() == 1) {
@@ -60,7 +60,7 @@ StatusOr<std::vector<ShapedBuffer>> Executable::ExecuteOnStreams(
   return std::move(return_values);
 }
 
-StatusOr<ShapedBuffer> Executable::ExecuteOnStreamWrapper(
+StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStreamWrapper(
     const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
     ArraySlice<const ShapedBuffer*> arguments) {
   se::Stream* stream = run_options->stream();
@@ -80,7 +80,7 @@ StatusOr<ShapedBuffer> Executable::ExecuteOnStreamWrapper(
                                             &hlo_profile_index_map())
           : nullptr;
 
-  StatusOr<ShapedBuffer> return_value =
+  StatusOr<ScopedShapedBuffer> return_value =
       ExecuteOnStream(run_options, arguments, profile_ptr.get());
   TF_RETURN_IF_ERROR(return_value.status());
 
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 0c95f1a361..f7af1ca574 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -63,14 +63,14 @@ class Executable {
   // enabled.
   //
   // Returns a shaped buffer containing the result of the computation.
-  virtual StatusOr<ShapedBuffer> ExecuteOnStream(
+  virtual StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) = 0;
 
   // Same as ExecuteOnStream(), but this call is non-blocking and returns as
   // soon as all of the operations are enqueued for launch on the stream.
-  virtual StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
+  virtual StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) = 0;
 
@@ -78,7 +78,7 @@ class Executable {
   // streams. arguments[i] contains the arguments to the execution on
   // run_options[i]->stream() and the returned value is at index i of the
   // returned vector.
-  virtual StatusOr<std::vector<ShapedBuffer>> ExecuteOnStreams(
+  virtual StatusOr<std::vector<ScopedShapedBuffer>> ExecuteOnStreams(
       tensorflow::gtl::ArraySlice<const ServiceExecutableRunOptions>
           run_options,
       tensorflow::gtl::ArraySlice<
@@ -98,7 +98,7 @@ class Executable {
   // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
   // timer for the execution, sets up HLO profiling if enabled, and fills in the
   // given ExecutionProfile if non-null.
-  StatusOr<ShapedBuffer> ExecuteOnStreamWrapper(
+  StatusOr<ScopedShapedBuffer> ExecuteOnStreamWrapper(
       const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 62ce15bc59..980cc89fa0 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -250,7 +250,7 @@ Status GpuExecutable::ExecuteThunks(
   return Status::OK();
 }
 
-StatusOr<ShapedBuffer> GpuExecutable::ExecuteOnStream(
+StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -297,8 +297,8 @@ StatusOr<ShapedBuffer> GpuExecutable::ExecuteOnStream(
 
   HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
   auto device_ordinal = executor->device_ordinal();
-  auto shaped_buffer = ShapedBuffer(root->shape(), root->shape(),
-                                    executor->platform(), device_ordinal);
+  ScopedShapedBuffer shaped_buffer(root->shape(), root->shape(),
+                                   memory_allocator, device_ordinal);
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer.
@@ -335,7 +335,7 @@ StatusOr<ShapedBuffer> GpuExecutable::ExecuteOnStream(
   return std::move(shaped_buffer);
 }
 
-StatusOr<ShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 361bc30b2f..80ec38c3ac 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -74,12 +74,12 @@ class GpuExecutable : public Executable {
 
   // ExecuteOnStream will fail if the compute capability of the stream doesn't
   // match the compute capability passed to this object's constructor.
-  StatusOr<ShapedBuffer> ExecuteOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index df5ffd0b7d..81c43db292 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -126,16 +126,12 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
   }
 
   TF_ASSIGN_OR_RETURN(
-      ShapedBuffer result,
+      ScopedShapedBuffer result,
       executable->ExecuteOnStreamWrapper(
           &service_run_options, /*profile=*/nullptr, argument_buffer_ptrs));
 
-  // Create a ScopedShapedBuffer of the result to manage deallocation. This will
-  // deallocate all the device memory when it goes out of scope.
-  ScopedShapedBuffer scoped_result(std::move(result), run_options.allocator());
-
   auto result_literal = backend().transfer_manager()->TransferLiteralFromDevice(
-      stream.parent(), scoped_result);
+      stream.parent(), result);
   if (result_literal.ok()) {
     VLOG(4) << "Executed binary and got result: "
             << result_literal.ValueOrDie()->ToString();
@@ -248,18 +244,16 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
   }
 
   LOG(INFO) << "Replicated execution started";
-  TF_ASSIGN_OR_RETURN(std::vector<ShapedBuffer> results,
+  TF_ASSIGN_OR_RETURN(std::vector<ScopedShapedBuffer> results,
                       executable->ExecuteOnStreams(service_run_options,
                                                    argument_buffer_slices));
   LOG(INFO) << "Replicated execution terminated";
 
   std::vector<std::unique_ptr<Literal>> exec_results;
   for (int64 i = 0; i < options.num_replicas; ++i) {
-    ScopedShapedBuffer result(std::move(results[i]),
-                              backend().memory_allocator());
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
                         backend().transfer_manager()->TransferLiteralFromDevice(
-                            streams[i]->parent(), result));
+                            streams[i]->parent(), results[i]));
     exec_results.push_back(std::move(literal));
   }
   return std::move(exec_results);
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 6553000336..61f199bc9e 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -45,7 +45,7 @@ InterpreterExecutable::InterpreterExecutable(
 
 InterpreterExecutable::~InterpreterExecutable() {}
 
-StatusOr<ShapedBuffer> InterpreterExecutable::ExecuteOnStream(
+StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -88,8 +88,8 @@ StatusOr<ShapedBuffer> InterpreterExecutable::ExecuteOnStream(
       evaluator.Evaluate<std::unique_ptr<Literal>>(*computation, arg_literals));
 
   // Transform the result literal back into a ShapedBuffer.
-  TF_ASSIGN_OR_RETURN(ShapedBuffer result,
-                      transfer_manager->AllocateShapedBuffer(
+  TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
+                      transfer_manager->AllocateScopedShapedBuffer(
                           result_literal->shape(), run_options->allocator(),
                           executor->device_ordinal()));
   TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice(
@@ -106,7 +106,7 @@ StatusOr<ShapedBuffer> InterpreterExecutable::ExecuteOnStream(
   return std::move(result);
 }
 
-StatusOr<ShapedBuffer> InterpreterExecutable::ExecuteAsyncOnStream(
+StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return tensorflow::errors::Unimplemented(
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index c825a9a368..b0b797ca7d 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -43,12 +43,12 @@ class InterpreterExecutable : public Executable {
   InterpreterExecutable(std::unique_ptr<const HloModule> hlo_module);
   ~InterpreterExecutable() override;
 
-  StatusOr<ShapedBuffer> ExecuteOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index a73118c68a..e8403c9e95 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -553,7 +553,7 @@ Service::ExecuteParallelAndRegisterResult(
     // Stream executors for the replicas of the current computation.
     TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i]));
     CHECK_EQ(replicas.size(), arguments[i].size());
-    std::vector<ShapedBuffer> result_buffers;
+    std::vector<ScopedShapedBuffer> result_buffers;
     for (int64 replica = 0; replica < replicas.size(); ++replica) {
       TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
                           backend->BorrowStream(replicas[replica]));
@@ -585,7 +585,7 @@ Service::ExecuteParallelAndRegisterResult(
                                               backend->StreamBorrower());
 
       // Asynchronously launch the computation.
-      TF_ASSIGN_OR_RETURN(ShapedBuffer result,
+      TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
                           executables[i]->ExecuteAsyncOnStream(
                               &run_options, arguments[i][replica]));
 
@@ -1237,7 +1237,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
     streams.push_back(std::move(stream));
   }
 
-  std::vector<ShapedBuffer> result_buffers;
+  std::vector<ScopedShapedBuffer> result_buffers;
   for (size_t i = 0; i < streams.size(); ++i) {
     const auto& stream = streams[i];
     ExecutableRunOptions options;
@@ -1250,7 +1250,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
     ServiceExecutableRunOptions service_options(
         options, execute_backend_->StreamBorrower());
 
-    TF_ASSIGN_OR_RETURN(ShapedBuffer this_result_buffer,
+    TF_ASSIGN_OR_RETURN(ScopedShapedBuffer this_result_buffer,
                         executable->ExecuteAsyncOnStream(
                             &service_options, replicated_arguments[i]));
 
@@ -1350,11 +1350,11 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
   }
 
   // Allocate memory in each replica and transfer the data to all replicas.
-  std::vector<ShapedBuffer> replicated_buffers;
+  std::vector<ScopedShapedBuffer> replicated_buffers;
   for (se::StreamExecutor* executor : replicas) {
     TF_ASSIGN_OR_RETURN(
-        ShapedBuffer shaped_buffer,
-        execute_backend_->transfer_manager()->AllocateShapedBuffer(
+        ScopedShapedBuffer shaped_buffer,
+        execute_backend_->transfer_manager()->AllocateScopedShapedBuffer(
             shape, execute_backend_->memory_allocator(),
             executor->device_ordinal()));
     TF_RETURN_IF_ERROR(
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 0b5a383f6f..fb3b5f06da 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -117,7 +117,7 @@ ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer,
     : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {}
 
 ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s)
-    : ShapedBuffer(std::move(s)), allocator_(s.allocator_) {
+    : ShapedBuffer(static_cast<ShapedBuffer&&>(s)), allocator_(s.allocator_) {
   // Null out s.allocator_ so it doesn't try to free anything in its destructor.
   s.allocator_ = nullptr;
 }
@@ -151,7 +151,7 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
 }
 
 ShapedBuffer ScopedShapedBuffer::release() {
-  ShapedBuffer shaped_buffer(std::move(*this));
+  ShapedBuffer shaped_buffer(static_cast<ShapedBuffer&&>(*this));
   buffers_ = ShapeTree<se::DeviceMemoryBase>();
   return shaped_buffer;
 }
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index f1b0527474..e10fca9e94 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -30,6 +30,8 @@ limitations under the License.
 
 namespace xla {
 
+class ScopedShapedBuffer;
+
 // Class which encapsulates a buffer or set of buffers containing data of a
 // particular XLA shape.
 class ShapedBuffer {
@@ -49,6 +51,10 @@ class ShapedBuffer {
   ShapedBuffer(const ShapedBuffer&) = delete;
   ShapedBuffer& operator=(const ShapedBuffer&) = delete;
 
+  // Prevent (some forms of) accidental object slicing.
+  ShapedBuffer(const ScopedShapedBuffer&) = delete;
+  ShapedBuffer& operator=(const ScopedShapedBuffer&) = delete;
+
   virtual ~ShapedBuffer();
 
   // Returns the shape of the on-host representation of the data held by this
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index 98d0111d04..8b71a41509 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -175,7 +175,7 @@ Status TransferManager::TransferBufferToDevice(
   return Status::OK();
 }
 
-StatusOr<ShapedBuffer> TransferManager::AllocateShapedBuffer(
+StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer(
     const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
     int device_ordinal) {
   if (!LayoutUtil::HasLayout(on_host_shape)) {
@@ -187,8 +187,8 @@ StatusOr<ShapedBuffer> TransferManager::AllocateShapedBuffer(
   const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape);
   TF_RET_CHECK(LayoutUtil::HasLayout(on_device_shape));
 
-  ShapedBuffer shaped_buffer(on_host_shape, on_device_shape,
-                             allocator->platform(), device_ordinal);
+  ScopedShapedBuffer shaped_buffer(on_host_shape, on_device_shape, allocator,
+                                   device_ordinal);
 
   // Allocate an appropriate sized buffer for each element in the shape
   // including the tuple pointer arrays.
@@ -204,13 +204,4 @@ StatusOr<ShapedBuffer> TransferManager::AllocateShapedBuffer(
   return std::move(shaped_buffer);
 }
 
-StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer(
-    const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
-    int device_ordinal) {
-  TF_ASSIGN_OR_RETURN(
-      ShapedBuffer unscoped_buffer,
-      AllocateShapedBuffer(on_host_shape, allocator, device_ordinal));
-  return ScopedShapedBuffer(std::move(unscoped_buffer), allocator);
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index a6451c4bb1..d82b4f0f81 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -104,12 +104,9 @@ class TransferManager {
   // region for a host-to-device transfer.
   virtual int64 GetByteSizeRequirement(const Shape& shape) const = 0;
 
-  // Allocate a ShapedBuffer which can hold data with the given on-host
+  // Allocates a ScopedShapedBuffer which can hold data with the given on-host
   // shape. The on-device shape may be different as indicated by
   // HostShapeToDeviceShape.
-  StatusOr<ShapedBuffer> AllocateShapedBuffer(const Shape& on_host_shape,
-                                              DeviceMemoryAllocator* allocator,
-                                              int device_ordinal);
   StatusOr<ScopedShapedBuffer> AllocateScopedShapedBuffer(
       const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
       int device_ordinal);
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index c7f64d8560..6f89e9164c 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -794,19 +794,19 @@ void BM_ParallelFusion(int num_iters) {
   // Transfer literals to device.
   auto param0_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1);
-  ShapedBuffer buffer0 =
+  ScopedShapedBuffer buffer0 =
       client->LiteralToShapedBuffer(*param0_literal, device_ordinal)
           .ConsumeValueOrDie();
 
   auto param1_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1);
-  ShapedBuffer buffer1 =
+  ScopedShapedBuffer buffer1 =
       client->LiteralToShapedBuffer(*param1_literal, device_ordinal)
           .ConsumeValueOrDie();
 
   auto param2_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1);
-  ShapedBuffer buffer2 =
+  ScopedShapedBuffer buffer2 =
       client->LiteralToShapedBuffer(*param2_literal, device_ordinal)
           .ConsumeValueOrDie();
 
-- 
GitLab


From c1544d1c34dac9aa01ed2de84bc850f8d1bfe919 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Sun, 22 Apr 2018 19:08:21 -0700
Subject: [PATCH 295/450] Update tuple for cuda version with auto as it was
 removed in #18434.

---
 tensorflow/core/kernels/conv_ops_gpu.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 7f9cfec981..bbd5a53660 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -143,8 +143,7 @@ class ConvParameters {
   bool ShouldIncludeWinogradNonfusedAlgo(
       perftools::gputools::StreamExecutor* stream_exec) const {
     // Skip this check for cuDNN 7 and newer.
-    perftools::gputools::port::StatusOr<std::tuple<int, int, int>> version =
-        stream_exec->AsDnn()->GetVersion();
+    auto version = stream_exec->AsDnn()->GetVersion();
     if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
       return true;
     }
-- 
GitLab


From e5cfbd0eceb4dca98b388b13acff499a5420f863 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Sun, 22 Apr 2018 20:00:54 -0700
Subject: [PATCH 296/450] Fix more for cuda version check.

---
 tensorflow/core/kernels/conv_ops_gpu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index bbd5a53660..e8da5298e6 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -144,7 +144,7 @@ class ConvParameters {
       perftools::gputools::StreamExecutor* stream_exec) const {
     // Skip this check for cuDNN 7 and newer.
     auto version = stream_exec->AsDnn()->GetVersion();
-    if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
+    if (version.ok() && version.ValueOrDie().major_version() >= 7) {
       return true;
     }
     return ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
-- 
GitLab


From 734636640534cd9478a7465c3975031a089629ea Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 22 Apr 2018 22:04:22 -0700
Subject: [PATCH 297/450] Rm references to SubmodelPort

PiperOrigin-RevId: 193873101
---
 tensorflow/contrib/optimizer_v2/optimizer_v2.py | 15 ---------------
 tensorflow/python/training/optimizer.py         | 15 ---------------
 2 files changed, 30 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 25d19578ea..dcb5bb6416 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -125,19 +125,6 @@ class _DenseResourceVariableProcessor(_OptimizableVariable):
       return update_op
 
 
-class _StreamingModelPortProcessor(_OptimizableVariable):
-  """Processor for streaming ModelPorts."""
-
-  def __init__(self, v):
-    self._v = v
-
-  def target(self):
-    return self._v
-
-  def update_op(self, optimizer, g, *args):
-    return g
-
-
 class _TensorProcessor(_OptimizableVariable):
   """Processor for ordinary Tensors.
 
@@ -167,8 +154,6 @@ def _get_processor(v):
     return _DenseResourceVariableProcessor(v)
   if isinstance(v, variables.Variable):
     return _RefVariableProcessor(v)
-  if v.op.type == "SubmodelPort":
-    return _StreamingModelPortProcessor(v)
   if isinstance(v, ops.Tensor):
     return _TensorProcessor(v)
   raise NotImplementedError("Trying to optimize unsupported type ", v)
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index f126d3847b..66914bacf3 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -170,19 +170,6 @@ class _DenseResourceVariableProcessor(_OptimizableVariable):
       return update_op
 
 
-class _StreamingModelPortProcessor(_OptimizableVariable):
-  """Processor for streaming ModelPorts."""
-
-  def __init__(self, v):
-    self._v = v
-
-  def target(self):
-    return self._v
-
-  def update_op(self, optimizer, g):
-    return g
-
-
 class _TensorProcessor(_OptimizableVariable):
   """Processor for ordinary Tensors.
 
@@ -216,8 +203,6 @@ def _get_processor(v):
     return _DenseResourceVariableProcessor(v)
   if isinstance(v, variables.Variable):
     return _RefVariableProcessor(v)
-  if v.op.type == "SubmodelPort":
-    return _StreamingModelPortProcessor(v)
   if isinstance(v, ops.Tensor):
     return _TensorProcessor(v)
   raise NotImplementedError("Trying to optimize unsupported type ", v)
-- 
GitLab


From 97bc1d90b385d06400376ceba8a924f4982c0434 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 22 Apr 2018 22:17:13 -0700
Subject: [PATCH 298/450] Init struct bools to false to prevent warnings by
 dynamic type checking programs when an uninitialized value is read by
 operator=.

PiperOrigin-RevId: 193873776
---
 tensorflow/core/framework/collective.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 40d82ab0e9..0943b85fba 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -80,7 +80,7 @@ struct CollInstanceParams {
   // Task name prefix of corresponding device name.
   std::vector<string> task_names;
   // True if every task has the same number of devices.
-  bool same_num_devices_per_task;
+  bool same_num_devices_per_task = false;
   CollImplDetails impl_details;
   string ToString() const;
   CollInstanceParams& operator=(const struct CollInstanceParams& other);
@@ -99,9 +99,9 @@ struct CollectiveParams {
   CollInstanceParams instance;
   CollTaskParams task;
 
-  string name;       // node name used only for log or error messages
-  int default_rank;  // index of this op within device_names
-  bool is_source;    // broadcast only
+  string name;             // node name used only for log or error messages
+  int default_rank;        // index of this op within device_names
+  bool is_source = false;  // broadcast only
   // Rank of this device in each subdivision permutation.
   std::vector<int> subdiv_rank;
   std::unique_ptr<OpKernel> merge_op;  // reduction only
-- 
GitLab


From 6d57bca02b3278e812658fe5514a2bcb17670dbe Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 02:53:01 -0700
Subject: [PATCH 299/450] Fix dilated bound calculation in window util for size
 0

Previously the logic calculated incorrect bounds for the case where the
base bound is 0, causing issues with 0 sized base dilated convolutions.

PiperOrigin-RevId: 193896380
---
 tensorflow/compiler/xla/window_util.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 93284b80f9..f11123ca24 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -199,6 +199,9 @@ bool IsInactiveWindowDimension(const Window& window, int64 logical_dim) {
 int64 DilatedBound(int64 bound, int64 dilation) {
   CHECK_GE(bound, 0);
   CHECK_GE(dilation, 1);
+  if (bound == 0) {
+    return 0;
+  }
 
   // Suppose the array has three entries 123 and the dilation factor is 4. Then
   // the dilated array has 9 entries 1xxx2xxx3. Here, each original entry except
@@ -212,7 +215,7 @@ int64 StridedBound(int64 bound, int64 window_size, int64 stride) {
   CHECK_GE(bound, 0);
   CHECK_GE(stride, 1);
 
-  if (window_size > bound) {
+  if (bound == 0 || window_size > bound) {
     return 0;
   }
 
-- 
GitLab


From a821ea02afd05a96dd0e118e6ee745d472c61b3e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 06:55:23 -0700
Subject: [PATCH 300/450] Support non-equal set sizes for FID computation.

PiperOrigin-RevId: 193917167
---
 .../eval/python/classifier_metrics_impl.py    | 30 ++++++++++---------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index 47e51415fd..d914f54945 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -488,25 +488,25 @@ def frechet_classifier_distance(real_images,
     The Frechet Inception distance. A floating-point scalar of the same type
     as the output of `classifier_fn`.
   """
-
   real_images_list = array_ops.split(
       real_images, num_or_size_splits=num_batches)
   generated_images_list = array_ops.split(
       generated_images, num_or_size_splits=num_batches)
 
-  imgs = array_ops.stack(real_images_list + generated_images_list)
+  real_imgs = array_ops.stack(real_images_list)
+  generated_imgs = array_ops.stack(generated_images_list)
 
   # Compute the activations using the memory-efficient `map_fn`.
-  activations = functional_ops.map_fn(
-      fn=classifier_fn,
-      elems=imgs,
-      parallel_iterations=1,
-      back_prop=False,
-      swap_memory=True,
-      name='RunClassifier')
+  def compute_activations(elems):
+    return functional_ops.map_fn(fn=classifier_fn,
+                                 elems=elems,
+                                 parallel_iterations=1,
+                                 back_prop=False,
+                                 swap_memory=True,
+                                 name='RunClassifier')
 
-  # Split the activations by the real and generated images.
-  real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0)
+  real_a = compute_activations(real_imgs)
+  gen_a = compute_activations(generated_imgs)
 
   # Ensure the activations have the right shapes.
   real_a = array_ops.concat(array_ops.unstack(real_a), 0)
@@ -697,18 +697,20 @@ def frechet_classifier_distance_from_activations(real_activations,
   # Compute mean and covariance matrices of activations.
   m = math_ops.reduce_mean(real_activations, 0)
   m_w = math_ops.reduce_mean(generated_activations, 0)
-  num_examples = math_ops.to_double(array_ops.shape(real_activations)[0])
+  num_examples_real = math_ops.to_double(array_ops.shape(real_activations)[0])
+  num_examples_generated = math_ops.to_double(
+      array_ops.shape(generated_activations)[0])
 
   # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
   real_centered = real_activations - m
   sigma = math_ops.matmul(
       real_centered, real_centered, transpose_a=True) / (
-          num_examples - 1)
+          num_examples_real - 1)
 
   gen_centered = generated_activations - m_w
   sigma_w = math_ops.matmul(
       gen_centered, gen_centered, transpose_a=True) / (
-          num_examples - 1)
+          num_examples_generated - 1)
 
   # Find the Tr(sqrt(sigma sigma_w)) component of FID
   sqrt_trace_component = trace_sqrt_product(sigma, sigma_w)
-- 
GitLab


From c45ffa87d3c7a74a32fcce5c9cebb2a30a2980ab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 07:36:37 -0700
Subject: [PATCH 301/450] Automated g4 rollback of changelist 193234819

PiperOrigin-RevId: 193921660
---
 .../ci_build/windows/bazel/bazel_test_lib.sh  |  7 +++++
 .../windows/cpu/pip/build_tf_windows.sh       | 26 +++++++++++++++----
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index d654b433e7..582188fc00 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -140,6 +140,13 @@ function run_configure_for_gpu_build {
   echo "" | ./configure
 }
 
+function set_gcs_remote_cache_options {
+  echo "build --experimental_remote_spawn_cache" >> "${TMP_BAZELRC}"
+  echo "build --experimental_remote_platform_override='properties:{name:\"build\" value:\"windows-x64\"}'" >> "${TMP_BAZELRC}"
+  echo "build --remote_http_cache=https://storage.googleapis.com/$GCS_BUCKET_NAME" >> "${TMP_BAZELRC}"
+  echo "build --google_credentials=$GOOGLE_CLOUD_CREDENTIAL" >> "${TMP_BAZELRC}"
+}
+
 function create_python_test_dir() {
   rm -rf "$1"
   mkdir -p "$1"
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 5e9ae497e1..8b7495b3b8 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -42,20 +42,36 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
+# Recreate an empty bazelrc file under source root
+export TMP_BAZELRC=.tmp.bazelrc
+rm -f "${TMP_BAZELRC}"
+touch "${TMP_BAZELRC}"
+
+function cleanup {
+  # Remove all options in .tmp.bazelrc
+  echo "" > "${TMP_BAZELRC}"
+}
+trap cleanup EXIT
+
 skip_test=0
 
 for ARG in "$@"; do
   if [[ "$ARG" == --skip_test ]]; then
     skip_test=1
+  elif [[ "$ARG" == --enable_gcs_remote_cache ]]; then
+    set_gcs_remote_cache_options
   fi
 done
 
-run_configure_for_cpu_build
-
 # --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
 # by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
-BUILD_OPTS="--define=override_eigen_strong_inline=true"
-bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+echo "build --define=override_eigen_strong_inline=true" >> "${TMP_BAZELRC}"
+
+echo "import %workspace%/${TMP_BAZELRC}" >> .bazelrc
+
+run_configure_for_cpu_build
+
+bazel build --announce_rc -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
 
 if [[ "$skip_test" == 1 ]]; then
   exit 0
@@ -73,7 +89,7 @@ reinstall_tensorflow_pip ${PIP_NAME}
 
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
-bazel test -c opt $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
-- 
GitLab


From 9a39d4890da10545f326cf4180d758f2d7c2a3bb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 08:27:07 -0700
Subject: [PATCH 302/450] Adds functionality to subsample the inputs to extract
 image patches. Add functionality to subsample the extracted image patches
 based on the number of the outer products per entry of the covariance matrix.

PiperOrigin-RevId: 193927804
---
 .../kernel_tests/fisher_factors_test.py       |  15 +++
 tensorflow/contrib/kfac/python/ops/BUILD      |   3 +
 .../contrib/kfac/python/ops/fisher_factors.py | 109 +++++++++++++++++-
 3 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
index 2a3592c53f..432b67e569 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
@@ -814,6 +814,21 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase):
       new_cov = sess.run(factor.make_covariance_update_op(0.))
       self.assertAllClose([[(1. + 4.) / 2.]], new_cov)
 
+  def testSubSample(self):
+    with tf_ops.Graph().as_default():
+      patches_1 = array_ops.constant(1, shape=(10, 2))
+      patches_2 = array_ops.constant(1, shape=(10, 8))
+      patches_3 = array_ops.constant(1, shape=(3, 3))
+      patches_1_sub = ff._subsample_for_cov_computation(patches_1)
+      patches_2_sub = ff._subsample_for_cov_computation(patches_2)
+      patches_3_sub = ff._subsample_for_cov_computation(patches_3)
+      patches_1_sub_batch_size = patches_1_sub.shape.as_list()[0]
+      patches_2_sub_batch_size = patches_2_sub.shape.as_list()[0]
+      patches_3_sub_batch_size = patches_3_sub.shape.as_list()[0]
+      self.assertEqual(2, patches_1_sub_batch_size)
+      self.assertEqual(8, patches_2_sub_batch_size)
+      self.assertEqual(3, patches_3_sub_batch_size)
+
 
 class ConvOutputKroneckerFactorTest(ConvFactorTestCase):
 
diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD
index b897fd68a0..cb0917bb85 100644
--- a/tensorflow/contrib/kfac/python/ops/BUILD
+++ b/tensorflow/contrib/kfac/python/ops/BUILD
@@ -37,10 +37,13 @@ py_library(
     deps = [
         ":utils",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
         "//tensorflow/python:special_math_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
index 0d40d265a1..b2da13db89 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -55,6 +56,22 @@ EIGENVALUE_DECOMPOSITION_THRESHOLD = 2
 # matrix powers. Must be nonnegative.
 EIGENVALUE_CLIPPING_THRESHOLD = 0.0
 
+# Used to subsample the flattened extracted image patches. The number of
+# outer products per row of the covariance matrix should not exceed this
+# value. This parameter is used only if `_SUB_SAMPLE_OUTER_PRODUCTS` is True.
+_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = 1
+
+# Used to subsample the inputs passed to extract image patches. The batch
+# size of the inputs to extract image patches is multiplied by this
+# factor. This parameter is used only if `_SUB_SAMPLE_INPUTS` is True.
+_INPUTS_TO_EXTRACT_PATCHES_FACTOR = 0.5
+
+# If True, then subsamples the tensor passed to compute the covariance matrix.
+_SUB_SAMPLE_OUTER_PRODUCTS = False
+
+# If True, then subsamples the inputs passed to extract image patches.
+_SUB_SAMPLE_INPUTS = False
+
 # TOWER_STRATEGY can be one of "concat" or "separate".  If "concat", the data
 # passed to the factors from the blocks will be concatenated across towers
 # (lazilly via PartitionedTensor objects).  Otherwise a tuple of tensors over
@@ -67,12 +84,20 @@ def set_global_constants(init_covariances_at_zero=None,
                          zero_debias=None,
                          eigenvalue_decomposition_threshold=None,
                          eigenvalue_clipping_threshold=None,
+                         max_num_outer_products_per_cov_row=None,
+                         sub_sample_outer_products=None,
+                         inputs_to_extract_ptaches_factor=None,
+                         sub_sample_inputs=None,
                          tower_strategy=None):
   """Sets various global constants used by the classes in this module."""
   global INIT_COVARIANCES_AT_ZERO
   global ZERO_DEBIAS
   global EIGENVALUE_DECOMPOSITION_THRESHOLD
   global EIGENVALUE_CLIPPING_THRESHOLD
+  global _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW
+  global _SUB_SAMPLE_OUTER_PRODUCTS
+  global _INPUTS_TO_EXTRACT_PATCHES_FACTOR
+  global _SUB_SAMPLE_INPUTS
   global TOWER_STRATEGY
 
   if init_covariances_at_zero is not None:
@@ -83,6 +108,14 @@ def set_global_constants(init_covariances_at_zero=None,
     EIGENVALUE_DECOMPOSITION_THRESHOLD = eigenvalue_decomposition_threshold
   if eigenvalue_clipping_threshold is not None:
     EIGENVALUE_CLIPPING_THRESHOLD = eigenvalue_clipping_threshold
+  if max_num_outer_products_per_cov_row is not None:
+    _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = max_num_outer_products_per_cov_row
+  if sub_sample_outer_products is not None:
+    _SUB_SAMPLE_OUTER_PRODUCTS = sub_sample_outer_products
+  if inputs_to_extract_ptaches_factor is not None:
+    _INPUTS_TO_EXTRACT_PATCHES_FACTOR = inputs_to_extract_ptaches_factor
+  if sub_sample_inputs is not None:
+    _SUB_SAMPLE_INPUTS = sub_sample_inputs
   if tower_strategy is not None:
     TOWER_STRATEGY = tower_strategy
 
@@ -227,6 +260,58 @@ def graph_func_to_string(func):
   return list_to_string(func.func_id)
 
 
+def _subsample_for_cov_computation(array, name=None):
+  """Subsamples the first dimension of the array.
+
+  `array`(A) is a tensor of shape `[batch_size, dim_2]`. Then the covariance
+  matrix(A^TA) is of shape `dim_2 ** 2`. Subsample only if the number of outer
+  products per row of the covariance matrix is greater than
+  `_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW`.
+
+  Args:
+    array: Tensor, of shape `[batch_size, dim_2]`.
+    name: `string`, Default(None)
+
+  Returns:
+    A tensor of shape `[max_samples, dim_2]`.
+
+  Raises:
+    ValueError: If array is not matrix-shaped.
+    ValueError: If array's batch_size cannot be inferred.
+
+  """
+  with tf_ops.name_scope(name, "subsample", [array]):
+    array = tf_ops.convert_to_tensor(array)
+    if len(array.shape) != 2:
+      raise ValueError("Input param array must be a matrix.")
+
+    batch_size = array.shape.as_list()[0]
+    if batch_size is None:
+      raise ValueError("Unable to get batch_size from input param array.")
+
+    num_cov_rows = array.shape.as_list()[-1]
+    max_batch_size = int(_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW * num_cov_rows)
+    if batch_size <= max_batch_size:
+      return array
+
+    return _random_tensor_gather(array, max_batch_size)
+
+
+def _random_tensor_gather(array, max_size):
+  """Generates a random set of indices and gathers the values at the indices.
+
+  Args:
+    array: Tensor, of shape `[batch_size, dim_2]`.
+    max_size: int, Number of indices to sample.
+
+  Returns:
+    A tensor of shape `[max_size, ...]`.
+  """
+  batch_size = array.shape.as_list()[0]
+  indices = random_ops.random_shuffle(math_ops.range(0, batch_size))[:max_size]
+  return array_ops.gather(array, indices)
+
+
 @six.add_metaclass(abc.ABCMeta)
 class FisherFactor(object):
   """Base class for objects modeling factors of approximate Fisher blocks.
@@ -1153,7 +1238,9 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
                dilation_rate=None,
                data_format=None,
                extract_patches_fn=None,
-               has_bias=False):
+               has_bias=False,
+               sub_sample_inputs=None,
+               sub_sample_patches=None):
     """Initializes ConvInputKroneckerFactor.
 
     Args:
@@ -1173,6 +1260,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
         patches. One of "extract_convolution_patches", "extract_image_patches",
         "extract_pointwise_conv2d_patches".
       has_bias: bool. If True, append 1 to in_channel.
+      sub_sample_inputs: `bool`. If True, then subsample the inputs from which
+        the image patches are extracted. (Default: None)
+      sub_sample_patches: `bool`, If `True` then subsample the extracted
+        patches.(Default: None)
     """
     self._inputs = inputs
     self._filter_shape = filter_shape
@@ -1182,7 +1273,15 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
     self._data_format = data_format
     self._extract_patches_fn = extract_patches_fn
     self._has_bias = has_bias
+    if sub_sample_inputs is None:
+      self._sub_sample_inputs = _SUB_SAMPLE_INPUTS
+    else:
+      self._sub_sample_inputs = sub_sample_inputs
 
+    if sub_sample_patches is None:
+      self._sub_sample_patches = _SUB_SAMPLE_OUTER_PRODUCTS
+    else:
+      self._sub_sample_patches = sub_sample_patches
     super(ConvInputKroneckerFactor, self).__init__()
 
   @property
@@ -1215,6 +1314,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
     assert source == 0
 
     inputs = self._inputs[tower]
+    if self._sub_sample_inputs:
+      batch_size = inputs.shape.as_list()[0]
+      max_size = int(batch_size * _INPUTS_TO_EXTRACT_PATCHES_FACTOR)
+      inputs = _random_tensor_gather(inputs, max_size)
 
     # TODO(b/64144716): there is potential here for a big savings in terms of
     # memory use.
@@ -1260,8 +1363,12 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
     # |Delta| = number of spatial offsets, and J = number of input maps
     # for convolutional layer l.
     patches_flat = array_ops.reshape(patches, [-1, flatten_size])
+
     # We append a homogenous coordinate to patches_flat if the layer has
     # bias parameters. This gives us [[A_l]]_H from the paper.
+    if self._sub_sample_patches:
+      patches_flat = _subsample_for_cov_computation(patches_flat)
+
     if self._has_bias:
       patches_flat = append_homog(patches_flat)
     # We call compute_cov without passing in a normalizer. compute_cov uses
-- 
GitLab


From fb7ce0375c325fc948b68126082b24bb0486c6a9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 08:43:18 -0700
Subject: [PATCH 303/450] Internal Change

PiperOrigin-RevId: 193929733
---
 tensorflow/compiler/aot/test.cc               |  1 +
 tensorflow/compiler/xla/service/backend.cc    |  1 +
 tensorflow/compiler/xla/shape_util.h          |  1 +
 .../xla/tests/local_client_test_base.cc       |  2 +-
 .../factorization/kernels/clustering_ops.cc   |  1 +
 .../contrib/ffmpeg/default/ffmpeg_lib.cc      |  2 +-
 tensorflow/core/BUILD                         |  6 ++-
 .../core/common_runtime/direct_session.cc     |  2 +-
 .../kernel_benchmark_testlib.cc               |  1 +
 .../core/common_runtime/local_device.cc       |  1 +
 .../core/common_runtime/process_util.cc       |  1 +
 tensorflow/core/framework/bfloat16.h          |  1 +
 tensorflow/core/grappler/clusters/utils.cc    |  1 +
 tensorflow/core/grappler/costs/utils.cc       |  2 +-
 tensorflow/core/grappler/devices.cc           |  1 +
 .../grappler/optimizers/constant_folding.cc   |  1 +
 .../adaptive_shared_batch_scheduler.h         |  1 +
 .../batching_util/shared_batch_scheduler.h    |  1 +
 tensorflow/core/kernels/cast_op.h             |  2 +-
 tensorflow/core/kernels/decode_raw_op.cc      |  2 +-
 .../core/kernels/mkl_input_conversion_op.cc   |  1 +
 tensorflow/core/kernels/mkl_tfconv_op.h       |  1 +
 tensorflow/core/kernels/sparse_matmul_op.h    |  1 +
 tensorflow/core/lib/bfloat16/bfloat16.h       |  3 +-
 tensorflow/core/lib/core/coding.cc            |  2 +-
 tensorflow/core/lib/core/raw_coding.h         |  2 +-
 tensorflow/core/lib/gtl/inlined_vector.h      |  2 +-
 tensorflow/core/lib/png/png_io.cc             |  2 +-
 tensorflow/core/lib/wav/wav_io.cc             |  2 +-
 tensorflow/core/platform/byte_order.h         | 37 +++++++++++++++++++
 tensorflow/core/platform/cpu_feature_guard.cc |  1 +
 tensorflow/core/platform/cpu_info.h           |  7 ++--
 tensorflow/core/platform/denormal.cc          |  3 +-
 tensorflow/core/platform/windows/cpu_info.h   |  9 -----
 34 files changed, 76 insertions(+), 28 deletions(-)
 create mode 100644 tensorflow/core/platform/byte_order.h

diff --git a/tensorflow/compiler/aot/test.cc b/tensorflow/compiler/aot/test.cc
index 47ef5f82cb..6b098049cb 100644
--- a/tensorflow/compiler/aot/test.cc
+++ b/tensorflow/compiler/aot/test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 // clang-format on
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index a582dbffd6..b1d616ec35 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 63da9154cf..5fa728e7c2 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index bb5aabb214..b615f0fead 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops.cc b/tensorflow/contrib/factorization/kernels/clustering_ops.cc
index 2a6c97e8b9..025534d540 100644
--- a/tensorflow/contrib/factorization/kernels/clustering_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/clustering_ops.cc
@@ -32,6 +32,7 @@
 #include "tensorflow/core/lib/gtl/top_n.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index 35341406a0..cca1a05419 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -28,7 +28,7 @@
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/env.h"
 
 using tensorflow::strings::StrCat;
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 5b04574a4f..a2ff29724b 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -271,7 +271,7 @@ PLATFORM_BASE_HDRS = [
     "platform/logging.h",
     "platform/macros.h",
     "platform/types.h",
-    "platform/cpu_info.h",
+    "platform/byte_order.h",
 ]
 
 PLATFORM_OTHER_HDRS = [
@@ -279,6 +279,7 @@ PLATFORM_OTHER_HDRS = [
     "platform/stacktrace.h",
     "platform/stacktrace_handler.h",
     "platform/context.h",
+    "platform/cpu_info.h",
     "platform/cpu_feature_guard.h",
     "platform/dynamic_annotations.h",
     "platform/env.h",
@@ -307,7 +308,6 @@ cc_library(
     srcs = glob([
         "platform/*/integral_types.h",
         "platform/*/logging.h",
-        "platform/*/cpu_info.h",
     ]),
     hdrs = PLATFORM_BASE_HDRS,
     deps = [
@@ -658,6 +658,7 @@ cc_library(
         "framework/tensor_types.h",
         "framework/type_traits.h",
         "lib/bfloat16/bfloat16.h",
+        "platform/byte_order.h",
         "platform/default/dynamic_annotations.h",
         "platform/default/integral_types.h",
         "platform/default/logging.h",
@@ -1903,6 +1904,7 @@ cc_library(
         "lib/core/casts.h",
         "lib/core/stringpiece.h",
         "lib/png/png_io.h",
+        "platform/byte_order.h",
         "platform/cpu_info.h",
         "platform/default/integral_types.h",
         "platform/default/logging.h",
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 0479061daf..0afbd02e86 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -54,7 +54,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/device_tracer.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 64d8849475..7de1b80e2d 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test_benchmark.h"
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index ca7f1614f1..873182371e 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_feature_guard.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index 22fd940d82..f8f3a1ecd7 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <string.h>
 
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
diff --git a/tensorflow/core/framework/bfloat16.h b/tensorflow/core/framework/bfloat16.h
index 968c18bdd2..2f79d0fa70 100644
--- a/tensorflow/core/framework/bfloat16.h
+++ b/tensorflow/core/framework/bfloat16.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_FRAMEWORK_BFLOAT16_H_
 
 #include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 #if defined(PLATFORM_WINDOWS)
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index 50d6e6468f..a7519725a5 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/mem.h"
 
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index f318e3911c..be54d98534 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -44,7 +44,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
diff --git a/tensorflow/core/grappler/devices.cc b/tensorflow/core/grappler/devices.cc
index b318ac22d4..2be894a08b 100644
--- a/tensorflow/core/grappler/devices.cc
+++ b/tensorflow/core/grappler/devices.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/grappler/devices.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index e29aaa25fe..45bb188e8d 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/setround.h"
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index 339d792302..f5ced95feb 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/thread_annotations.h"
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
index b77289aded..edc88a0384 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/thread_annotations.h"
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index fd4e75d26f..16d2e0e0a5 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index bacacb94ae..eaef5a6097 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index dcf6bb9f74..ea763ce85b 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index ddea9e281b..4120f013ac 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index 14ef2ed704..e89280724e 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_KERNELS_SPARSE_MATMUL_OP_H_
 
 #include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 #if defined(PLATFORM_WINDOWS)
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index 126e5a17af..e7c24387a4 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -19,8 +19,7 @@ limitations under the License.
 #include <cmath>
 #include <complex>
 
-// We need cpu_info.h here in order to pick up __BYTE_ORDER__.
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 
 #ifdef __CUDACC__
 // All functions callable from CUDA code must be qualified with __device__
diff --git a/tensorflow/core/lib/core/coding.cc b/tensorflow/core/lib/core/coding.cc
index bb95c27410..50872eef83 100644
--- a/tensorflow/core/lib/core/coding.cc
+++ b/tensorflow/core/lib/core/coding.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/coding.h"
 
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 
 namespace tensorflow {
 namespace core {
diff --git a/tensorflow/core/lib/core/raw_coding.h b/tensorflow/core/lib/core/raw_coding.h
index bbfd33d303..37201b755d 100644
--- a/tensorflow/core/lib/core/raw_coding.h
+++ b/tensorflow/core/lib/core/raw_coding.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_LIB_CORE_RAW_CODING_H_
 
 #include <string.h>
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h
index 6e3cb2206d..2011f7d4a1 100644
--- a/tensorflow/core/lib/gtl/inlined_vector.h
+++ b/tensorflow/core/lib/gtl/inlined_vector.h
@@ -43,7 +43,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/lib/gtl/manual_constructor.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index cba473927d..62c803afb2 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -26,7 +26,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/png/png_io.h"
-#include "tensorflow/core/platform/cpu_info.h"  // endian
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/png.h"
 
diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
index 51b9c6cd82..3f7dbcee85 100644
--- a/tensorflow/core/lib/wav/wav_io.cc
+++ b/tensorflow/core/lib/wav/wav_io.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/wav/wav_io.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
diff --git a/tensorflow/core/platform/byte_order.h b/tensorflow/core/platform/byte_order.h
new file mode 100644
index 0000000000..aab6535e4b
--- /dev/null
+++ b/tensorflow/core/platform/byte_order.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
+#define TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
+
+// Byte order defines provided by gcc. MSVC doesn't define those so
+// we define them here.
+// We assume that all Windows platforms out there are little endian.
+#if defined(_MSC_VER) && !defined(__clang__)
+#define __ORDER_LITTLE_ENDIAN__ 0x4d2
+#define __ORDER_BIG_ENDIAN__ 0x10e1
+#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#endif
+
+namespace tensorflow {
+namespace port {
+
+// TODO(jeff,sanjay): Make portable
+constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
+
+}  // namespace port
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index b570658158..9d00aa7b7f 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <mutex>
 #include <string>
 
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index bb77650e26..b5be7e8b54 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -18,6 +18,10 @@ limitations under the License.
 
 #include <string>
 
+// TODO(ahentz): This is not strictly required here but, for historical
+// reasons, many people depend on cpu_info.h in order to use kLittleEndian.
+#include "tensorflow/core/platform/byte_order.h"
+
 #if defined(_MSC_VER)
 #include "tensorflow/core/platform/windows/cpu_info.h"
 #endif
@@ -25,9 +29,6 @@ limitations under the License.
 namespace tensorflow {
 namespace port {
 
-// TODO(jeff,sanjay): Make portable
-constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
-
 // Returns an estimate of the number of schedulable CPUs for this
 // process.  Usually, it's constant throughout the lifetime of a
 // process, but it might change if the underlying cluster management
diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc
index 82cbc43b4f..c510dc204f 100644
--- a/tensorflow/core/platform/denormal.cc
+++ b/tensorflow/core/platform/denormal.cc
@@ -15,8 +15,9 @@ limitations under the License.
 
 #include <tuple>
 
-#include "tensorflow/core/platform/denormal.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/platform.h"
 // If we're on gcc 4.8 or older, there's a known bug that prevents the use of
diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h
index f20939d3c0..ba2126abcf 100644
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@@ -19,13 +19,4 @@ limitations under the License.
 // included so __cpuidex function is available for GETCPUID on Windows
 #include <intrin.h>
 
-// Byte order defines provided by gcc. MSVC doesn't define those so
-// we define them here.
-// We assume that all windows platform out there are little endian.
-#if defined(_MSC_VER) && !defined(__clang__)
-#define __ORDER_LITTLE_ENDIAN__ 0x4d2
-#define __ORDER_BIG_ENDIAN__ 0x10e1
-#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
-#endif
-
 #endif  // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
-- 
GitLab


From 26ff316f49e613a7f9cba02dd5e7d6cd5aa78623 Mon Sep 17 00:00:00 2001
From: Pavithra Vijay <psv@google.com>
Date: Mon, 23 Apr 2018 11:03:13 -0700
Subject: [PATCH 304/450] Fix flaky stateful metrics test

PiperOrigin-RevId: 193951580
---
 .../keras/_impl/keras/engine/network.py       |   2 +-
 .../python/keras/_impl/keras/metrics_test.py  | 129 +++++++++---------
 2 files changed, 66 insertions(+), 65 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py
index cc177c14a8..3b419dff3a 100644
--- a/tensorflow/python/keras/_impl/keras/engine/network.py
+++ b/tensorflow/python/keras/_impl/keras/engine/network.py
@@ -126,7 +126,7 @@ class Network(base_layer.Layer):
     else:
       self.outputs = [outputs]
 
-    # User-prodived argument validation.
+    # User-provided argument validation.
     if context.executing_eagerly():
       # Check that all inputs/outputs are DeferredTensors.
       for tensor in self.inputs:
diff --git a/tensorflow/python/keras/_impl/keras/metrics_test.py b/tensorflow/python/keras/_impl/keras/metrics_test.py
index 9deaab0c05..13cef97812 100644
--- a/tensorflow/python/keras/_impl/keras/metrics_test.py
+++ b/tensorflow/python/keras/_impl/keras/metrics_test.py
@@ -75,74 +75,75 @@ class KerasMetricsTest(test.TestCase):
       self.assertEqual(result, 0.)
 
   def test_stateful_metrics(self):
-    np.random.seed(1334)
-
-    class BinaryTruePositives(keras.layers.Layer):
-      """Stateful Metric to count the total true positives over all batches.
-
-      Assumes predictions and targets of shape `(samples, 1)`.
-
-      Arguments:
-          threshold: Float, lower limit on prediction value that counts as a
-              positive class prediction.
-          name: String, name for the metric.
-      """
-
-      def __init__(self, name='true_positives', **kwargs):
-        super(BinaryTruePositives, self).__init__(name=name, **kwargs)
-        self.true_positives = keras.backend.variable(value=0, dtype='int32')
-
-      def reset_states(self):
-        keras.backend.set_value(self.true_positives, 0)
+    with self.test_session():
+      np.random.seed(1334)
 
-      def __call__(self, y_true, y_pred):
-        """Computes the number of true positives in a batch.
+      class BinaryTruePositives(keras.layers.Layer):
+        """Stateful Metric to count the total true positives over all batches.
 
-        Args:
-            y_true: Tensor, batch_wise labels
-            y_pred: Tensor, batch_wise predictions
+        Assumes predictions and targets of shape `(samples, 1)`.
 
-        Returns:
-            The total number of true positives seen this epoch at the
-                completion of the batch.
+        Arguments:
+            threshold: Float, lower limit on prediction value that counts as a
+                positive class prediction.
+            name: String, name for the metric.
         """
-        y_true = math_ops.cast(y_true, 'int32')
-        y_pred = math_ops.cast(math_ops.round(y_pred), 'int32')
-        correct_preds = math_ops.cast(math_ops.equal(y_pred, y_true), 'int32')
-        true_pos = math_ops.cast(
-            math_ops.reduce_sum(correct_preds * y_true), 'int32')
-        current_true_pos = self.true_positives * 1
-        self.add_update(
-            state_ops.assign_add(self.true_positives, true_pos),
-            inputs=[y_true, y_pred])
-        return current_true_pos + true_pos
-
-    metric_fn = BinaryTruePositives()
-    config = keras.metrics.serialize(metric_fn)
-    metric_fn = keras.metrics.deserialize(
-        config, custom_objects={'BinaryTruePositives': BinaryTruePositives})
-
-    # Test on simple model
-    inputs = keras.Input(shape=(2,))
-    outputs = keras.layers.Dense(1, activation='sigmoid')(inputs)
-    model = keras.Model(inputs, outputs)
-    model.compile(optimizer='sgd',
-                  loss='binary_crossentropy',
-                  metrics=['acc', metric_fn])
-
-    # Test fit, evaluate
-    samples = 1000
-    x = np.random.random((samples, 2))
-    y = np.random.randint(2, size=(samples, 1))
-    model.fit(x, y, epochs=1, batch_size=10)
-    outs = model.evaluate(x, y, batch_size=10)
-    preds = model.predict(x)
-
-    def ref_true_pos(y_true, y_pred):
-      return np.sum(np.logical_and(y_pred > 0.5, y_true == 1))
-
-    # Test correctness (e.g. updates should have been run)
-    self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5)
+
+        def __init__(self, name='true_positives', **kwargs):
+          super(BinaryTruePositives, self).__init__(name=name, **kwargs)
+          self.true_positives = keras.backend.variable(value=0, dtype='int32')
+
+        def reset_states(self):
+          keras.backend.set_value(self.true_positives, 0)
+
+        def __call__(self, y_true, y_pred):
+          """Computes the number of true positives in a batch.
+
+          Args:
+              y_true: Tensor, batch_wise labels
+              y_pred: Tensor, batch_wise predictions
+
+          Returns:
+              The total number of true positives seen this epoch at the
+                  completion of the batch.
+          """
+          y_true = math_ops.cast(y_true, 'int32')
+          y_pred = math_ops.cast(math_ops.round(y_pred), 'int32')
+          correct_preds = math_ops.cast(math_ops.equal(y_pred, y_true), 'int32')
+          true_pos = math_ops.cast(
+              math_ops.reduce_sum(correct_preds * y_true), 'int32')
+          current_true_pos = self.true_positives * 1
+          self.add_update(
+              state_ops.assign_add(self.true_positives, true_pos),
+              inputs=[y_true, y_pred])
+          return current_true_pos + true_pos
+
+      metric_fn = BinaryTruePositives()
+      config = keras.metrics.serialize(metric_fn)
+      metric_fn = keras.metrics.deserialize(
+          config, custom_objects={'BinaryTruePositives': BinaryTruePositives})
+
+      # Test on simple model
+      inputs = keras.Input(shape=(2,))
+      outputs = keras.layers.Dense(1, activation='sigmoid')(inputs)
+      model = keras.Model(inputs, outputs)
+      model.compile(optimizer='sgd',
+                    loss='binary_crossentropy',
+                    metrics=['acc', metric_fn])
+
+      # Test fit, evaluate
+      samples = 1000
+      x = np.random.random((samples, 2))
+      y = np.random.randint(2, size=(samples, 1))
+      model.fit(x, y, epochs=1, batch_size=10)
+      outs = model.evaluate(x, y, batch_size=10)
+      preds = model.predict(x)
+
+      def ref_true_pos(y_true, y_pred):
+        return np.sum(np.logical_and(y_pred > 0.5, y_true == 1))
+
+      # Test correctness (e.g. updates should have been run)
+      self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5)
 
 
 if __name__ == '__main__':
-- 
GitLab


From f0d5d2047833c7221ce3be1690689ca1c6658add Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 11:23:01 -0700
Subject: [PATCH 305/450] Convert int -> size_t so that implicit conversion
 doesn't lose integer precision.

PiperOrigin-RevId: 193955175
---
 tensorflow/contrib/lite/context.h               |  6 +++---
 tensorflow/contrib/lite/interpreter.cc          | 13 +++++++++----
 tensorflow/contrib/lite/interpreter.h           | 12 ++++++------
 tensorflow/contrib/lite/interpreter_test.cc     |  8 ++++----
 tensorflow/contrib/lite/optional_debug_tools.cc |  2 +-
 5 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index 0b38f43cd3..12841d233c 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -275,7 +275,7 @@ typedef struct {
 
 typedef struct TfLiteContext {
   // Number of tensors in the context.
-  int tensors_size;
+  size_t tensors_size;
 
   // The execution plan contains a list of the node indices in execution
   // order. execution_plan->size is the current number of nodes. And,
@@ -397,13 +397,13 @@ typedef struct _TfLiteDelegate {
   // This can be null if the delegate doesn't use its own buffer.
   TfLiteStatus (*CopyFromBufferHandle)(TfLiteDelegate* delegate,
                                        TfLiteBufferHandle buffer_handle,
-                                       void* data, int size);
+                                       void* data, size_t size);
 
   // Copy the data from raw memory to delegate buffer handle.
   // This can be null if the delegate doesn't use its own buffer.
   TfLiteStatus (*CopyToBufferHandle)(TfLiteDelegate* delegate,
                                      TfLiteBufferHandle buffer_handle,
-                                     void* data, int size);
+                                     void* data, size_t size);
 
   // Free the Delegate Buffer Handle. Note: This only frees the handle, but
   // this doesn't release the underlying resource (e.g. textures). The
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 91b6c414bf..9d8ea55fd1 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -308,7 +308,12 @@ TfLiteStatus Interpreter::CheckTensorIndices(const char* label,
 
   for (int i = 0; i < length; i++) {
     int index = indices[i];
-    if (index < kOptionalTensor || index >= context_.tensors_size) {
+    // Continue if index == kOptionalTensor before additional comparisons below,
+    // since size_t(-1) is always >= context_.tensors_size.
+    if (index == kOptionalTensor) {
+      continue;
+    }
+    if (index < 0 || static_cast<size_t>(index) >= context_.tensors_size) {
       ReportError(&context_, "Invalid tensor index %d in %s\n", index, label);
       consistent_ = false;
       return kTfLiteError;
@@ -318,7 +323,7 @@ TfLiteStatus Interpreter::CheckTensorIndices(const char* label,
 }
 
 TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims,
-                                        int dims_size, size_t* bytes) {
+                                        size_t dims_size, size_t* bytes) {
   // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
   // MultiplyWithoutOverflow.
   TF_LITE_ENSURE(&context_, bytes != nullptr);
@@ -645,7 +650,7 @@ TfLiteStatus Interpreter::GetNodeAndRegistration(
 }
 
 TfLiteStatus Interpreter::SetTensorParametersReadOnly(
-    int tensor_index, TfLiteType type, const char* name, const int rank,
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
     size_t bytes, const Allocation* allocation) {
   if (state_ == kStateInvokableAndImmutable) {
@@ -691,7 +696,7 @@ TfLiteStatus Interpreter::SetTensorParametersReadOnly(
 // bytes. The lifetime of buffer must be ensured to be greater or equal
 // to Interpreter.
 TfLiteStatus Interpreter::SetTensorParametersReadWrite(
-    int tensor_index, TfLiteType type, const char* name, const int rank,
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization) {
   if (state_ == kStateInvokableAndImmutable) {
     ReportError(
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index a49134b95e..6f3433abcf 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -150,7 +150,7 @@ class Interpreter {
   };
 
   TfLiteStatus SetTensorParametersReadOnly(
-      int tensor_index, TfLiteType type, const char* name, const int rank,
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
       const int* dims, TfLiteQuantizationParams quantization,
       const char* buffer, size_t bytes, const Allocation* allocation = nullptr);
 
@@ -165,7 +165,7 @@ class Interpreter {
                                         dims.data(), quantization);
   }
   TfLiteStatus SetTensorParametersReadWrite(
-      int tensor_index, TfLiteType type, const char* name, const int rank,
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
       const int* dims, TfLiteQuantizationParams quantization);
 
   // Functions to access tensor data
@@ -189,10 +189,10 @@ class Interpreter {
   }
 
   // Return the number of tensors in the model.
-  int tensors_size() const { return context_.tensors_size; }
+  size_t tensors_size() const { return context_.tensors_size; }
 
   // Return the number of ops in the model.
-  int nodes_size() const { return nodes_and_registration_.size(); }
+  size_t nodes_size() const { return nodes_and_registration_.size(); }
 
   // WARNING: Experimental interface, subject to change
   const std::vector<int>& execution_plan() const { return execution_plan_; }
@@ -406,7 +406,7 @@ class Interpreter {
   // Compute the number of bytes required to represent a tensor with dimensions
   // specified by the array dims (of length dims_size). Returns the status code
   // and bytes.
-  TfLiteStatus BytesRequired(TfLiteType type, const int* dims, int dims_size,
+  TfLiteStatus BytesRequired(TfLiteType type, const int* dims, size_t dims_size,
                              size_t* bytes);
 
   // Request an tensor be resized implementation. If the given tensor is of
@@ -467,7 +467,7 @@ class Interpreter {
   // tensors. After calling this function, adding `kTensorsCapacityHeadroom`
   // more tensors won't invalidate the pointer to existing tensors.
   void EnsureTensorsVectorCapacity() {
-    const int required_capacity = tensors_size() + kTensorsCapacityHeadroom;
+    const size_t required_capacity = tensors_size() + kTensorsCapacityHeadroom;
     if (required_capacity > tensors_.capacity()) {
       tensors_.reserve(required_capacity);
       context_.tensors = tensors_.data();
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index 131e088079..453c1ada1c 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -887,15 +887,15 @@ class TestDelegate : public ::testing::Test {
         TfLiteIntArrayFree(nodes_to_separate);
         return kTfLiteOk;
       };
-      delegate_.CopyToBufferHandle = [](TfLiteDelegate* delegate,
-                                        TfLiteBufferHandle buffer_handle,
-                                        void* data, int size) -> TfLiteStatus {
+      delegate_.CopyToBufferHandle =
+          [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle,
+             void* data, size_t size) -> TfLiteStatus {
         // TODO(ycling): Implement tests to test buffer copying logic.
         return kTfLiteOk;
       };
       delegate_.CopyFromBufferHandle =
           [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle,
-             void* data, int size) -> TfLiteStatus {
+             void* data, size_t size) -> TfLiteStatus {
         // TODO(ycling): Implement tests to test buffer copying logic.
         return kTfLiteOk;
       };
diff --git a/tensorflow/contrib/lite/optional_debug_tools.cc b/tensorflow/contrib/lite/optional_debug_tools.cc
index e1366639c7..e0a0910117 100644
--- a/tensorflow/contrib/lite/optional_debug_tools.cc
+++ b/tensorflow/contrib/lite/optional_debug_tools.cc
@@ -72,7 +72,7 @@ const char* AllocTypeName(TfLiteAllocationType type) {
 
 // Prints a dump of what tensors and what nodes are in the interpreter.
 void PrintInterpreterState(Interpreter* interpreter) {
-  printf("Interpreter has %d tensors and %d nodes\n",
+  printf("Interpreter has %zu tensors and %zu nodes\n",
          interpreter->tensors_size(), interpreter->nodes_size());
   printf("Inputs:");
   PrintIntVector(interpreter->inputs());
-- 
GitLab


From 829ec055afdfca3424030794c469d19290df13fe Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Mon, 23 Apr 2018 11:44:22 -0700
Subject: [PATCH 306/450] Update resources.h

---
 .../core/kernels/boosted_trees/resources.h    | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index ef42604897..df78d3f275 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -82,26 +82,6 @@ class BoostedTreesEnsembleResource : public StampedResource {
 
   int64 GetNumNodes(const int32 tree_id);
 
-  void UpdateLastLayerNodesRange(const int32 node_range_start,
-                                 int32 node_range_end) const {
-    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
-        node_range_start);
-    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
-        node_range_end);
-  }
-
-  void GetLastLayerNodesRange(int32* node_range_start,
-                              int32* node_range_end) const {
-    *node_range_start =
-        tree_ensemble_->growing_metadata().last_layer_node_start();
-    *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
-  }
-
-  int64 GetNumNodes(const int32 tree_id) {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->trees(tree_id).nodes_size();
-  }
-
   void UpdateGrowingMetadata() const;
 
   int32 GetNumLayersAttempted();
-- 
GitLab


From d93e09fbd3408f6ee1647addfdca1eef00139223 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 11:42:02 -0700
Subject: [PATCH 307/450] Add fast shuffled fully-connected path also for the
 case where the RHS has 4 columns (so far was only for the GEMV case where RHS
 has 1 column).

Also pre-shuffle / pre-xor the input activations, not just the
weights. We need a buffer for that, so the shuffled FullyConnected
operator gets a second output acting as its workspace, similar
to what we have been doing for Conv operators that need an im2col
workspace buffer.

PiperOrigin-RevId: 193958461
---
 .../internal/optimized/optimized_ops.h        | 448 +++++++++++++-----
 .../internal/reference/reference_ops.h        | 155 ++++--
 .../experimental_shuffle_fc_weights.cc        |  27 +-
 tensorflow/contrib/lite/toco/tooling_util.cc  |  15 +-
 4 files changed, 483 insertions(+), 162 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 2e2721e093..49ce1133d3 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1209,109 +1209,275 @@ void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
 // as the 'task' for worker threads to run (multi-threaded case, see
 // ExperimentalShuffledFullyConnectedWorkerTask below).
 inline void ExperimentalShuffledFullyConnectedWorkerImpl(
-    const uint8* input_data, const int8* shuffled_weights_data,
-    int output_depth, int accum_depth, const int32* bias_data,
+    const uint8* shuffled_input_workspace_data,
+    const int8* shuffled_weights_data, int batches, int output_depth,
+    int output_stride, int accum_depth, const int32* bias_data,
     int32 output_multiplier, int output_shift, int16* output_data) {
-  const int8* shuffled_weights_ptr = shuffled_weights_data;
 #if defined USE_NEON
-  // We'll only need to xor signbit to the input activation values, as
-  // that xor-ing is pre-built into the shuffled weights values.
-  const uint8x16_t signbit = vdupq_n_u8(0x80);
-  const int right_shift = output_shift > 0 ? output_shift : 0;
-  const int left_shift = output_shift > 0 ? 0 : -output_shift;
-  for (int c = 0; c < output_depth; c += 4) {
-    // Accumulation loop.
-    int32x4_t row_accum0 = vdupq_n_s32(0);
-    int32x4_t row_accum1 = vdupq_n_s32(0);
-    int32x4_t row_accum2 = vdupq_n_s32(0);
-    int32x4_t row_accum3 = vdupq_n_s32(0);
-    for (int d = 0; d < accum_depth; d += 16) {
-      int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0);
-      int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16);
-      int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32);
-      int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48);
-      shuffled_weights_ptr += 64;
-      int8x16_t input =
-          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(input_data + d)));
-      int16x8_t local_accum0 =
-          vmull_s8(vget_low_s8(weights0), vget_low_s8(input));
-      int16x8_t local_accum1 =
-          vmull_s8(vget_low_s8(weights1), vget_low_s8(input));
-      int16x8_t local_accum2 =
-          vmull_s8(vget_low_s8(weights2), vget_low_s8(input));
-      int16x8_t local_accum3 =
-          vmull_s8(vget_low_s8(weights3), vget_low_s8(input));
-      local_accum0 =
-          vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input));
-      local_accum1 =
-          vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input));
-      local_accum2 =
-          vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input));
-      local_accum3 =
-          vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input));
-      row_accum0 = vpadalq_s16(row_accum0, local_accum0);
-      row_accum1 = vpadalq_s16(row_accum1, local_accum1);
-      row_accum2 = vpadalq_s16(row_accum2, local_accum2);
-      row_accum3 = vpadalq_s16(row_accum3, local_accum3);
+  const int8* shuffled_weights_ptr = shuffled_weights_data;
+  if (batches == 1) {
+    const int right_shift = output_shift > 0 ? output_shift : 0;
+    const int left_shift = output_shift > 0 ? 0 : -output_shift;
+    for (int c = 0; c < output_depth; c += 4) {
+      // Accumulation loop.
+      int32x4_t row_accum0 = vdupq_n_s32(0);
+      int32x4_t row_accum1 = vdupq_n_s32(0);
+      int32x4_t row_accum2 = vdupq_n_s32(0);
+      int32x4_t row_accum3 = vdupq_n_s32(0);
+      for (int d = 0; d < accum_depth; d += 16) {
+        int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0);
+        int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16);
+        int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32);
+        int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48);
+        shuffled_weights_ptr += 64;
+        int8x16_t input =
+            vreinterpretq_s8_u8(vld1q_u8(shuffled_input_workspace_data + d));
+        int16x8_t local_accum0 =
+            vmull_s8(vget_low_s8(weights0), vget_low_s8(input));
+        int16x8_t local_accum1 =
+            vmull_s8(vget_low_s8(weights1), vget_low_s8(input));
+        int16x8_t local_accum2 =
+            vmull_s8(vget_low_s8(weights2), vget_low_s8(input));
+        int16x8_t local_accum3 =
+            vmull_s8(vget_low_s8(weights3), vget_low_s8(input));
+        local_accum0 =
+            vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input));
+        local_accum1 =
+            vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input));
+        local_accum2 =
+            vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input));
+        local_accum3 =
+            vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input));
+        row_accum0 = vpadalq_s16(row_accum0, local_accum0);
+        row_accum1 = vpadalq_s16(row_accum1, local_accum1);
+        row_accum2 = vpadalq_s16(row_accum2, local_accum2);
+        row_accum3 = vpadalq_s16(row_accum3, local_accum3);
+      }
+      // Horizontally reduce accumulators
+      int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
+          pairwise_reduced_acc_2, pairwise_reduced_acc_3;
+      pairwise_reduced_acc_0 =
+          vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0));
+      pairwise_reduced_acc_1 =
+          vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1));
+      pairwise_reduced_acc_2 =
+          vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2));
+      pairwise_reduced_acc_3 =
+          vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3));
+      const int32x2_t reduced_lo =
+          vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
+      const int32x2_t reduced_hi =
+          vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
+      int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+      // Add bias values.
+      int32x4_t bias_vec = vld1q_s32(bias_data + c);
+      reduced = vaddq_s32(reduced, bias_vec);
+      reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
+      // Multiply by the fixed-point multiplier.
+      reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+      // Rounding-shift-right.
+      using gemmlowp::RoundingDivideByPOT;
+      reduced = RoundingDivideByPOT(reduced, right_shift);
+      // Narrow values down to 16 bit signed.
+      const int16x4_t res16 = vqmovn_s32(reduced);
+      vst1_s16(output_data + c, res16);
     }
-    // Horizontally reduce accumulators
-    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
-        pairwise_reduced_acc_2, pairwise_reduced_acc_3;
-    pairwise_reduced_acc_0 =
-        vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0));
-    pairwise_reduced_acc_1 =
-        vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1));
-    pairwise_reduced_acc_2 =
-        vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2));
-    pairwise_reduced_acc_3 =
-        vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3));
-    const int32x2_t reduced_lo =
-        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
-    const int32x2_t reduced_hi =
-        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
-    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
-    // Add bias values.
-    int32x4_t bias_vec = vld1q_s32(bias_data + c);
-    reduced = vaddq_s32(reduced, bias_vec);
-    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
-    // Multiply by the fixed-point multiplier.
-    reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
-    // Rounding-shift-right.
-    using gemmlowp::RoundingDivideByPOT;
-    reduced = RoundingDivideByPOT(reduced, right_shift);
-    // Narrow values down to 16 bit signed.
-    const int16x4_t res16 = vqmovn_s32(reduced);
-    vst1_s16(output_data + c, res16);
+  } else if (batches == 4) {
+    const int right_shift = output_shift > 0 ? output_shift : 0;
+    const int left_shift = output_shift > 0 ? 0 : -output_shift;
+    for (int c = 0; c < output_depth; c += 4) {
+      const int8* shuffled_input_ptr =
+          reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+      // Accumulation loop.
+      int32x4_t row_accum00 = vdupq_n_s32(0);
+      int32x4_t row_accum10 = vdupq_n_s32(0);
+      int32x4_t row_accum20 = vdupq_n_s32(0);
+      int32x4_t row_accum30 = vdupq_n_s32(0);
+      int32x4_t row_accum01 = vdupq_n_s32(0);
+      int32x4_t row_accum11 = vdupq_n_s32(0);
+      int32x4_t row_accum21 = vdupq_n_s32(0);
+      int32x4_t row_accum31 = vdupq_n_s32(0);
+      int32x4_t row_accum02 = vdupq_n_s32(0);
+      int32x4_t row_accum12 = vdupq_n_s32(0);
+      int32x4_t row_accum22 = vdupq_n_s32(0);
+      int32x4_t row_accum32 = vdupq_n_s32(0);
+      int32x4_t row_accum03 = vdupq_n_s32(0);
+      int32x4_t row_accum13 = vdupq_n_s32(0);
+      int32x4_t row_accum23 = vdupq_n_s32(0);
+      int32x4_t row_accum33 = vdupq_n_s32(0);
+      for (int d = 0; d < accum_depth; d += 16) {
+        int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0);
+        int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16);
+        int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32);
+        int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48);
+        shuffled_weights_ptr += 64;
+        int8x16_t input0 = vld1q_s8(shuffled_input_ptr + 0);
+        int8x16_t input1 = vld1q_s8(shuffled_input_ptr + 16);
+        int8x16_t input2 = vld1q_s8(shuffled_input_ptr + 32);
+        int8x16_t input3 = vld1q_s8(shuffled_input_ptr + 48);
+        shuffled_input_ptr += 64;
+        int16x8_t local_accum0, local_accum1, local_accum2, local_accum3;
+#define TFLITE_SHUFFLED_FC_ACCUM(B)                                           \
+  local_accum0 = vmull_s8(vget_low_s8(weights0), vget_low_s8(input##B));      \
+  local_accum1 = vmull_s8(vget_low_s8(weights1), vget_low_s8(input##B));      \
+  local_accum2 = vmull_s8(vget_low_s8(weights2), vget_low_s8(input##B));      \
+  local_accum3 = vmull_s8(vget_low_s8(weights3), vget_low_s8(input##B));      \
+  local_accum0 =                                                              \
+      vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input##B)); \
+  local_accum1 =                                                              \
+      vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input##B)); \
+  local_accum2 =                                                              \
+      vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input##B)); \
+  local_accum3 =                                                              \
+      vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input##B)); \
+  row_accum0##B = vpadalq_s16(row_accum0##B, local_accum0);                   \
+  row_accum1##B = vpadalq_s16(row_accum1##B, local_accum1);                   \
+  row_accum2##B = vpadalq_s16(row_accum2##B, local_accum2);                   \
+  row_accum3##B = vpadalq_s16(row_accum3##B, local_accum3);
+
+        TFLITE_SHUFFLED_FC_ACCUM(0)
+        TFLITE_SHUFFLED_FC_ACCUM(1)
+        TFLITE_SHUFFLED_FC_ACCUM(2)
+        TFLITE_SHUFFLED_FC_ACCUM(3)
+
+#undef TFLITE_SHUFFLED_FC_ACCUM
+      }
+      // Horizontally reduce accumulators
+
+#define TFLITE_SHUFFLED_FC_STORE(B)                                           \
+  {                                                                           \
+    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,                 \
+        pairwise_reduced_acc_2, pairwise_reduced_acc_3;                       \
+    pairwise_reduced_acc_0 =                                                  \
+        vpadd_s32(vget_low_s32(row_accum0##B), vget_high_s32(row_accum0##B)); \
+    pairwise_reduced_acc_1 =                                                  \
+        vpadd_s32(vget_low_s32(row_accum1##B), vget_high_s32(row_accum1##B)); \
+    pairwise_reduced_acc_2 =                                                  \
+        vpadd_s32(vget_low_s32(row_accum2##B), vget_high_s32(row_accum2##B)); \
+    pairwise_reduced_acc_3 =                                                  \
+        vpadd_s32(vget_low_s32(row_accum3##B), vget_high_s32(row_accum3##B)); \
+    const int32x2_t reduced_lo =                                              \
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);            \
+    const int32x2_t reduced_hi =                                              \
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);            \
+    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);                 \
+    int32x4_t bias_vec = vld1q_s32(bias_data + c);                            \
+    reduced = vaddq_s32(reduced, bias_vec);                                   \
+    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));                    \
+    reduced = vqrdmulhq_n_s32(reduced, output_multiplier);                    \
+    using gemmlowp::RoundingDivideByPOT;                                      \
+    reduced = RoundingDivideByPOT(reduced, right_shift);                      \
+    const int16x4_t res16 = vqmovn_s32(reduced);                              \
+    vst1_s16(output_data + c + B * output_stride, res16);                     \
+  }
+
+      TFLITE_SHUFFLED_FC_STORE(0);
+      TFLITE_SHUFFLED_FC_STORE(1);
+      TFLITE_SHUFFLED_FC_STORE(2);
+      TFLITE_SHUFFLED_FC_STORE(3);
+
+#undef TFLITE_SHUFFLED_FC_STORE
+    }
+  } else {
+    TFLITE_DCHECK(false);
+    return;
   }
 #else
-  for (int c = 0; c < output_depth; c += 4) {
-    // Internal accumulation.
-    // Initialize accumulator with the bias-value.
-    int32 accum[4] = {0};
-    // Accumulation loop.
-    for (int d = 0; d < accum_depth; d += 16) {
-      for (int i = 0; i < 4; i++) {
-        for (int j = 0; j < 16; j++) {
-          int8 input_val = input_data[d + j] - 128;
-          int8 weights_val = *shuffled_weights_ptr++;
-          accum[i] += weights_val * input_val;
+  if (batches == 1) {
+    int16* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8 values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, the caller has preshuffled and pre-xored the input data.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      // Internal accumulation.
+      // Zero-initialize the accumulators; the bias is added after the loop.
+      int32 accum[4] = {0};
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int j = 0; j < 16; j++) {
+            int8 input_val = shuffled_input_data[d + j];
+            int8 weights_val = *shuffled_weights_ptr++;
+            accum[i] += weights_val * input_val;
+          }
         }
       }
+      for (int i = 0; i < 4; i++) {
+        // Add bias value
+        int acc = accum[i] + bias_data[c + i];
+        // Down-scale the final int32 accumulator to the scale used by our
+        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+        // multiplier and shift here have been pre-computed offline
+        // (e.g. by toco).
+        acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                            -output_shift);
+        // Saturate, cast to int16, and store to output array.
+        acc = std::max(acc, -32768);
+        acc = std::min(acc, 32767);
+        output_ptr[c + i] = acc;
+      }
     }
-    for (int i = 0; i < 4; i++) {
-      // Add bias value
-      int acc = accum[i] + bias_data[c + i];
-      // Down-scale the final int32 accumulator to the scale used by our
-      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
-      // multiplier and shift here have been pre-computed offline
-      // (e.g. by toco).
-      acc =
-          MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift);
-      // Saturate, cast to int16, and store to output array.
-      acc = std::max(acc, -32768);
-      acc = std::min(acc, 32767);
-      output_data[c + i] = acc;
+  } else if (batches == 4) {
+    int16* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8 values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, the caller has preshuffled and pre-xored the input data.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      const int8* shuffled_input_ptr = shuffled_input_data;
+      // Internal accumulation.
+      // Zero-initialize the 4x4 accumulators (weights row i, batch b);
+      // the bias is added after the accumulation loop below.
+      int32 accum[4][4];
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          accum[i][b] = 0;
+        }
+      }
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int b = 0; b < 4; b++) {
+            for (int j = 0; j < 16; j++) {
+              int8 input_val = shuffled_input_ptr[16 * b + j];
+              int8 weights_val = shuffled_weights_ptr[16 * i + j];
+              accum[i][b] += weights_val * input_val;
+            }
+          }
+        }
+        shuffled_input_ptr += 64;
+        shuffled_weights_ptr += 64;
+      }
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          // Add bias value
+          int acc = accum[i][b] + bias_data[c + i];
+          // Down-scale the final int32 accumulator to the scale used by our
+          // (16-bit, typically 3 integer bits) fixed-point format. The
+          // quantized multiplier and shift here have been pre-computed offline
+          // (e.g. by toco).
+          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                              -output_shift);
+          // Saturate, cast to int16, and store to output array.
+          acc = std::max(acc, -32768);
+          acc = std::min(acc, 32767);
+          output_ptr[b * output_stride + c + i] = acc;
+        }
+      }
     }
+  } else {
+    TFLITE_DCHECK(false);
+    return;
   }
 #endif
 }
@@ -1320,12 +1486,15 @@ inline void ExperimentalShuffledFullyConnectedWorkerImpl(
 // to allow using gemmlowp's threadpool.
 struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task {
   ExperimentalShuffledFullyConnectedWorkerTask(
-      const uint8* input_data, const int8* shuffled_weights_data,
-      int output_depth, int accum_depth, const int32* bias_data,
-      int32 output_multiplier, int output_shift, int16* output_data)
+      const uint8* input_data, const int8* shuffled_weights_data, int batches,
+      int output_depth, int output_stride, int accum_depth,
+      const int32* bias_data, int32 output_multiplier, int output_shift,
+      int16* output_data)
       : input_data_(input_data),
         shuffled_weights_data_(shuffled_weights_data),
+        batches_(batches),
         output_depth_(output_depth),
+        output_stride_(output_stride),
         accum_depth_(accum_depth),
         bias_data_(bias_data),
         output_multiplier_(output_multiplier),
@@ -1334,13 +1503,16 @@ struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task {
 
   void Run() override {
     ExperimentalShuffledFullyConnectedWorkerImpl(
-        input_data_, shuffled_weights_data_, output_depth_, accum_depth_,
-        bias_data_, output_multiplier_, output_shift_, output_data_);
+        input_data_, shuffled_weights_data_, batches_, output_depth_,
+        output_stride_, accum_depth_, bias_data_, output_multiplier_,
+        output_shift_, output_data_);
   }
 
   const uint8* input_data_;
   const int8* shuffled_weights_data_;
+  int batches_;
   int output_depth_;
+  int output_stride_;
   int accum_depth_;
   const int32* bias_data_;
   int32 output_multiplier_;
@@ -1354,7 +1526,7 @@ inline void ExperimentalShuffledFullyConnected(
     const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
     int output_shift, int32 output_activation_min, int32 output_activation_max,
     int16* output_data, const Dims<4>& output_dims,
-    gemmlowp::GemmContext* gemm_context) {
+    uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) {
   gemmlowp::ScopedProfilingLabel label(
       "ExperimentalShuffledFullyConnected/8bit");
   (void)gemm_context;  // only used in optimized code.
@@ -1371,10 +1543,8 @@ inline void ExperimentalShuffledFullyConnected(
   const int accum_depth = ArraySize(weights_dims, 0);
   TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
-  // The experimental shuffling is an optimization for matrix*vector product.
-  // We aren't interested in supporting non-matrix*vector-product cases, i.e.
-  // batches>1.
-  TFLITE_DCHECK_EQ(batches, 1);
+  TFLITE_DCHECK((accum_depth % 16) == 0);
+  TFLITE_DCHECK((output_depth % 4) == 0);
   // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
   // so that just reinterpreting them as int8 values is equivalent to
   // subtracting 128 from them, thus implementing for free the subtraction of
@@ -1382,18 +1552,71 @@ inline void ExperimentalShuffledFullyConnected(
   const int8* int8_shuffled_weights_data =
       reinterpret_cast<const int8*>(shuffled_weights_data);
 
-  // Our GEMV kernel has 4 rows. This doesn't matter in practice for GEMV
-  // shapes, gemmlowp::HowManyThreads only takes that parameter because it
-  // matters for other kinds of GEMM shapes.
+  // Shuffling and xoring of input activations into the workspace buffer
+  if (batches == 1) {
+#ifdef USE_NEON
+    const uint8x16_t signbit = vdupq_n_u8(0x80);
+    for (int i = 0; i < accum_depth; i += 16) {
+      uint8x16_t val = vld1q_u8(input_data + i);
+      val = veorq_u8(val, signbit);
+      vst1q_u8(shuffled_input_workspace_data + i, val);
+    }
+#else
+    for (int i = 0; i < accum_depth; i++) {
+      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
+    }
+#endif
+  } else if (batches == 4) {
+    uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
+    int c = 0;
+#ifdef USE_NEON
+    const uint8x16_t signbit = vdupq_n_u8(0x80);
+    for (c = 0; c < accum_depth; c += 16) {
+      const uint8* src_data_ptr = input_data + c;
+      uint8x16_t val0 = vld1q_u8(src_data_ptr + 0 * accum_depth);
+      uint8x16_t val1 = vld1q_u8(src_data_ptr + 1 * accum_depth);
+      uint8x16_t val2 = vld1q_u8(src_data_ptr + 2 * accum_depth);
+      uint8x16_t val3 = vld1q_u8(src_data_ptr + 3 * accum_depth);
+      val0 = veorq_u8(val0, signbit);
+      val1 = veorq_u8(val1, signbit);
+      val2 = veorq_u8(val2, signbit);
+      val3 = veorq_u8(val3, signbit);
+      vst1q_u8(shuffled_input_workspace_ptr + 0, val0);
+      vst1q_u8(shuffled_input_workspace_ptr + 16, val1);
+      vst1q_u8(shuffled_input_workspace_ptr + 32, val2);
+      vst1q_u8(shuffled_input_workspace_ptr + 48, val3);
+      shuffled_input_workspace_ptr += 64;
+    }
+#else
+    for (c = 0; c < accum_depth; c += 16) {
+      for (int b = 0; b < 4; b++) {
+        const uint8* src_data_ptr = input_data + b * accum_depth + c;
+        for (int j = 0; j < 16; j++) {
+          uint8 src_val = *src_data_ptr++;
+          // Flip the sign bit, so that the kernel will only need to
+          // reinterpret these uint8 values as int8, getting for free the
+          // subtraction of the zero_point value 128.
+          uint8 dst_val = src_val ^ 0x80;
+          *shuffled_input_workspace_ptr++ = dst_val;
+        }
+      }
+    }
+#endif
+  } else {
+    TFLITE_DCHECK(false);
+    return;
+  }
+
   static constexpr int kKernelRows = 4;
   const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
-      gemm_context->max_num_threads(), output_depth, 1, accum_depth);
+      gemm_context->max_num_threads(), output_depth, batches, accum_depth);
   if (thread_count == 1) {
     // Single-thread case: do the computation on the current thread, don't
     // use a threadpool
     ExperimentalShuffledFullyConnectedWorkerImpl(
-        input_data, int8_shuffled_weights_data, output_depth, accum_depth,
-        bias_data, output_multiplier, output_shift, output_data);
+        shuffled_input_workspace_data, int8_shuffled_weights_data, batches,
+        output_depth, output_depth, accum_depth, bias_data, output_multiplier,
+        output_shift, output_data);
     return;
   }
 
@@ -1406,8 +1629,9 @@ inline void ExperimentalShuffledFullyConnected(
   for (int i = 0; i < thread_count; i++) {
     int row_end = std::min(output_depth, row_start + kRowsPerWorker);
     tasks[i] = new ExperimentalShuffledFullyConnectedWorkerTask(
-        input_data, int8_shuffled_weights_data + row_start * accum_depth,
-        row_end - row_start, accum_depth, bias_data + row_start,
+        shuffled_input_workspace_data,
+        int8_shuffled_weights_data + row_start * accum_depth, batches,
+        row_end - row_start, output_depth, accum_depth, bias_data + row_start,
         output_multiplier, output_shift, output_data + row_start);
     row_start = row_end;
   }
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 49a93b0c6d..d1d4f54f86 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -608,8 +608,9 @@ inline void ExperimentalShuffledFullyConnected(
     const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
     int output_shift, int32 output_activation_min, int32 output_activation_max,
     int16* output_data, const Dims<4>& output_dims,
-    gemmlowp::GemmContext* gemm_context) {
+    uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) {
   (void)gemm_context;  // only used in optimized code.
+
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   // TODO(benoitjacob): This really should be:
   //     const int batches = ArraySize(output_dims, 1);
@@ -622,44 +623,130 @@ inline void ExperimentalShuffledFullyConnected(
   const int accum_depth = ArraySize(weights_dims, 0);
   TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
   TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
-  // The experimental shuffling is an optimization for matrix*vector product.
-  // We aren't interested in supporting non-matrix*vector-product cases, i.e.
-  // batches>1.
-  TFLITE_DCHECK_EQ(batches, 1);
-  // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
-  // so that just reinterpreting them as int8 values is equivalent to
-  // subtracting 128 from them, thus implementing for free the subtraction of
-  // the zero_point value 128.
-  const int8* shuffled_weights_ptr =
-      reinterpret_cast<const int8*>(shuffled_weights_data);
-  for (int c = 0; c < output_depth; c += 4) {
-    // Internal accumulation.
-    // Initialize accumulator with the bias-value.
-    int32 accum[4] = {0};
-    // Accumulation loop.
-    for (int d = 0; d < accum_depth; d += 16) {
-      for (int i = 0; i < 4; i++) {
+  TFLITE_DCHECK((accum_depth % 16) == 0);
+  TFLITE_DCHECK((output_depth % 4) == 0);
+
+  // Shuffling and xoring of input activations into the workspace buffer
+  uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
+  if (batches == 1) {
+    for (int i = 0; i < accum_depth; i++) {
+      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
+    }
+  } else if (batches == 4) {
+    for (int c = 0; c < accum_depth; c += 16) {
+      for (int b = 0; b < 4; b++) {
+        const uint8* src_data_ptr = input_data + b * accum_depth + c;
         for (int j = 0; j < 16; j++) {
-          int8 input_val = input_data[d + j] - 128;
-          int8 weights_val = *shuffled_weights_ptr++;
-          accum[i] += weights_val * input_val;
+          uint8 src_val = *src_data_ptr++;
+          // Flip the sign bit, so that the kernel will only need to
+          // reinterpret these uint8 values as int8, getting for free the
+          // subtraction of the zero_point value 128.
+          uint8 dst_val = src_val ^ 0x80;
+          *shuffled_input_workspace_ptr++ = dst_val;
         }
       }
     }
-    for (int i = 0; i < 4; i++) {
-      // Add bias value
-      int acc = accum[i] + bias_data[c + i];
-      // Down-scale the final int32 accumulator to the scale used by our
-      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
-      // multiplier and shift here have been pre-computed offline
-      // (e.g. by toco).
-      acc =
-          MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift);
-      // Saturate, cast to int16, and store to output array.
-      acc = std::max(acc, output_activation_min);
-      acc = std::min(acc, output_activation_max);
-      output_data[c + i] = acc;
+  } else {
+    TFLITE_DCHECK(false);
+    return;
+  }
+
+  // Actual computation
+  if (batches == 1) {
+    int16* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8 values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      // Internal accumulation.
+      // Zero-initialize the accumulators; the bias is added after the loop.
+      int32 accum[4] = {0};
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int j = 0; j < 16; j++) {
+            int8 input_val = shuffled_input_data[d + j];
+            int8 weights_val = *shuffled_weights_ptr++;
+            accum[i] += weights_val * input_val;
+          }
+        }
+      }
+      for (int i = 0; i < 4; i++) {
+        // Add bias value
+        int acc = accum[i] + bias_data[c + i];
+        // Down-scale the final int32 accumulator to the scale used by our
+        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+        // multiplier and shift here have been pre-computed offline
+        // (e.g. by toco).
+        acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                            -output_shift);
+        // Saturate, cast to int16, and store to output array.
+        acc = std::max(acc, output_activation_min);
+        acc = std::min(acc, output_activation_max);
+        output_ptr[c + i] = acc;
+      }
+    }
+  } else if (batches == 4) {
+    int16* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8 values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      const int8* shuffled_input_ptr = shuffled_input_data;
+      // Internal accumulation.
+      // Zero-initialize the 4x4 accumulators (weights row i, batch b);
+      // the bias is added after the accumulation loop below.
+      int32 accum[4][4];
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          accum[i][b] = 0;
+        }
+      }
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int b = 0; b < 4; b++) {
+            for (int j = 0; j < 16; j++) {
+              int8 input_val = shuffled_input_ptr[16 * b + j];
+              int8 weights_val = shuffled_weights_ptr[16 * i + j];
+              accum[i][b] += weights_val * input_val;
+            }
+          }
+        }
+        shuffled_input_ptr += 64;
+        shuffled_weights_ptr += 64;
+      }
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          // Add bias value
+          int acc = accum[i][b] + bias_data[c + i];
+          // Down-scale the final int32 accumulator to the scale used by our
+          // (16-bit, typically 3 integer bits) fixed-point format. The
+          // quantized multiplier and shift here have been pre-computed offline
+          // (e.g. by toco).
+          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                              -output_shift);
+          // Saturate, cast to int16, and store to output array.
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_ptr[b * output_depth + c + i] = acc;
+        }
+      }
     }
+  } else {
+    TFLITE_DCHECK(false);
+    return;
   }
 }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc
index f098981a5c..c00cdcb944 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc
@@ -55,17 +55,26 @@ bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) {
   // Exit if, based on the known shapes, this FC op is not a GEMV.
   // The shuffling of FC weights is only useful to enable fast GEMV paths.
   const Shape& input_shape = input_array.shape();
-  for (int i = 0; i < input_shape.dimensions_count() - 1; i++) {
+  for (int i = 1; i < input_shape.dimensions_count() - 1; i++) {
     if (input_shape.dims(i) != 1) {
       // The input activations, shaped as a matrix, have multiple columns.
       // This FC op isn't a matrix*vector multiplication.
       AddMessageF(
           "Not applying experimental shuffling to the weights of %s because "
-          "it's not a matrix*vector product",
+          "the input shape is not 1D or 2D (possibly with additional inner "
+          "dimensions of size 1)",
           LogName(*op));
       return false;
     }
   }
+  if (input_shape.dims(0) != 1 && input_shape.dims(0) != 4) {
+    AddMessageF(
+        "Not applying experimental shuffling to the weights of %s because "
+        "the input shape's leading dimension, i.e. the 'batch size', is not "
+        "equal to 1 or 4",
+        LogName(*op));
+    return false;
+  }
   // Exit if the weights shape isn't an integral multiple of the shuffled
   // block shape, 4x16. We don't want to have to write code dealing with
   // odd sizes, that would go un-exercised at the moment as the models
@@ -129,6 +138,20 @@ bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) {
   fc_op->experimental_shuffled_weights = true;
   AddMessageF("Applied experimental shuffling to the weights of %s",
               LogName(*op));
+  // Add a second output array to this FC op, serving as a workspace to perform
+  // runtime shuffling/xoring of its input activations.
+  CHECK_EQ(fc_op->outputs.size(), 1);
+  const string& shuffled_input_workspace_array_name =
+      AvailableArrayName(*model, fc_op->inputs[0] + "_shuffled");
+  fc_op->outputs.push_back(shuffled_input_workspace_array_name);
+  auto& shuffled_input_workspace_array =
+      model->GetOrCreateArray(shuffled_input_workspace_array_name);
+  shuffled_input_workspace_array.data_type = input_array.data_type;
+  *shuffled_input_workspace_array.mutable_shape() = input_array.shape();
+  shuffled_input_workspace_array.GetOrCreateMinMax() = input_array.GetMinMax();
+  shuffled_input_workspace_array.GetOrCreateQuantizationParams() =
+      input_array.GetQuantizationParams();
+
   return true;
 }
 
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index cf2cbeedc7..5a341294db 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1405,20 +1405,7 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
     }
     input_minmax.min = (qmin - mean_value) / std_value;
     input_minmax.max = (qmax - mean_value) / std_value;
-    if (input_array.minmax) {
-      if (input_array_proto.has_mean_value() ||
-          input_array_proto.has_std_value()) {
-        const double width = input_minmax.max - input_minmax.min;
-        const double kMinMaxAllowedDiff = 1e-6 * width;
-        CHECK(std::abs(input_minmax.min - input_array.minmax->min) <
-                  kMinMaxAllowedDiff &&
-              std::abs(input_minmax.max - input_array.minmax->max) <
-                  kMinMaxAllowedDiff)
-            << input_minmax.min << ", " << input_minmax.max
-            << " != " << input_array.minmax->min << ", "
-            << input_array.minmax->max;
-      }
-    } else {
+    if (!input_array.minmax) {
       input_array.GetOrCreateMinMax() = input_minmax;
     }
   }
-- 
GitLab


From 89ff74a7b25c01a511e84a805d3b2edf780142a6 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Mon, 23 Apr 2018 12:03:19 -0700
Subject: [PATCH 308/450] [XLA] Disallow conversion from StatusOr<T> to
 StatusOr<U> if T is not convertible to U.

PiperOrigin-RevId: 193962287
---
 tensorflow/compiler/xla/statusor.h | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h
index 641b5e9a6a..cccbce5fc8 100644
--- a/tensorflow/compiler/xla/statusor.h
+++ b/tensorflow/compiler/xla/statusor.h
@@ -113,17 +113,19 @@ class StatusOr : private internal_statusor::StatusOrData<T>,
   StatusOr& operator=(StatusOr&&) = default;
 
   // Conversion copy/move constructor, T must be convertible from U.
-  // TODO(b/62186717): These should not participate in overload resolution if U
-  // is not convertible to T.
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr(const StatusOr<U>& other);
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr(StatusOr<U>&& other);
 
   // Conversion copy/move assignment operator, T must be convertible from U.
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr& operator=(const StatusOr<U>& other);
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr& operator=(StatusOr<U>&& other);
 
   // Constructs a new StatusOr with the given value. After calling this
@@ -233,12 +235,14 @@ StatusOr<T>& StatusOr<T>::operator=(Status&& status) {
 }
 
 template <typename T>
-template <typename U>
+template <typename U,
+          typename std::enable_if<std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>::StatusOr(const StatusOr<U>& other)
     : Base(static_cast<const typename StatusOr<U>::Base&>(other)) {}
 
 template <typename T>
-template <typename U>
+template <typename U,
+          typename std::enable_if<std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>& StatusOr<T>::operator=(const StatusOr<U>& other) {
   if (other.ok())
     this->Assign(other.ValueOrDie());
@@ -248,12 +252,14 @@ inline StatusOr<T>& StatusOr<T>::operator=(const StatusOr<U>& other) {
 }
 
 template <typename T>
-template <typename U>
+template <typename U,
+          typename std::enable_if<std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>::StatusOr(StatusOr<U>&& other)
     : Base(static_cast<typename StatusOr<U>::Base&&>(other)) {}
 
 template <typename T>
-template <typename U>
+template <typename U,
+          typename std::enable_if<std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>& StatusOr<T>::operator=(StatusOr<U>&& other) {
   if (other.ok()) {
     this->Assign(std::move(other).ValueOrDie());
-- 
GitLab


From 4adc560844c4d769efdaeb5b67d5ace1e0df7b16 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 12:21:29 -0700
Subject: [PATCH 309/450] Rewrite tail recursion in loop optimizer as loop to
 avoid stack overflow.

PiperOrigin-RevId: 193965038
---
 .../grappler/optimizers/loop_optimizer.cc     | 70 +++++++++++--------
 1 file changed, 39 insertions(+), 31 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index fff06dd2ac..f7994221bb 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -320,42 +320,50 @@ Status LoopInvariantNodeMotionOptimizer::RevertInvariantNodes() {
   return Status::OK();
 }
 
-Status LoopInvariantNodeMotionOptimizer::FindInvariantNodes(NodeDef* node) {
-  auto consumers = node_map_->GetOutputs(node->name());
-  invariant_nodes_.insert(std::make_pair(node, consumers.size()));
-  for (auto* consumer : consumers) {
-    if (invariant_nodes_.count(consumer) || ModifiesFrameInfo(*consumer)) {
-      continue;
-    }
-    bool is_invariant = true;
-    for (const auto& input : consumer->input()) {
-      if (!IsControlInput(input)) {
-        const string name = NodeName(input);
-        auto* producer = node_map_->GetNode(name);
-        if (!invariant_nodes_.count(producer)) {
-          if (IsConstant(*producer)) {
-            invariant_nodes_.insert(
-                std::make_pair(producer, node_map_->GetOutputs(name).size()));
-          } else {
-            is_invariant = false;
-            break;
-          }
-        }
+Status LoopInvariantNodeMotionOptimizer::FindInvariantNodes(
+    NodeDef* start_node) {
+  std::vector<NodeDef*> stack;
+  stack.reserve(32);
+  stack.push_back(start_node);
+  while (!stack.empty()) {
+    NodeDef* node = stack.back();
+    stack.pop_back();
+    auto consumers = node_map_->GetOutputs(node->name());
+    invariant_nodes_.emplace(node, consumers.size());
+    for (auto* consumer : consumers) {
+      if (invariant_nodes_.count(consumer) || ModifiesFrameInfo(*consumer)) {
+        continue;
       }
-    }
-    if (is_invariant) {
-      std::set<NodeDef*> producers;
+      bool is_invariant = true;
       for (const auto& input : consumer->input()) {
-        auto* producer = node_map_->GetNode(input);
-        producers.insert(producer);
+        if (!IsControlInput(input)) {
+          const string name = NodeName(input);
+          auto* producer = node_map_->GetNode(name);
+          if (!invariant_nodes_.count(producer)) {
+            if (IsConstant(*producer)) {
+              invariant_nodes_.insert(
+                  std::make_pair(producer, node_map_->GetOutputs(name).size()));
+            } else {
+              is_invariant = false;
+              break;
+            }
+          }
+        }
       }
-      for (auto* producer : producers) {
-        auto iter = invariant_nodes_.find(producer);
-        if (iter != invariant_nodes_.end()) {
-          --iter->second;
+      if (is_invariant) {
+        std::set<NodeDef*> producers;
+        for (const auto& input : consumer->input()) {
+          auto* producer = node_map_->GetNode(input);
+          producers.insert(producer);
+        }
+        for (auto* producer : producers) {
+          auto iter = invariant_nodes_.find(producer);
+          if (iter != invariant_nodes_.end()) {
+            --iter->second;
+          }
         }
+        stack.push_back(consumer);
       }
-      TF_RETURN_IF_ERROR(FindInvariantNodes(consumer));
     }
   }
   return Status::OK();
-- 
GitLab


From 7de04c4cd9fb6a38b1b34d02fed14c89057bf002 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Mon, 23 Apr 2018 12:21:57 -0700
Subject: [PATCH 310/450] Add TensorFlow format support to
 tf.keras.Model.save_weights and load_weights

Supports restore-on-create in subclassed Models when executing eagerly, and removes the requirement that the Model be built before weights are loaded.

Currently only subclassed Models work with the TensorFlow weight format. Graph networks will need a bit of extra logic to support the same topology/by-name distinction as the current HDF5 format (and for now they don't even add Checkpointable dependencies on their sub-layers).

Some notes:
  - Checkpoints won't be numbered. This keeps behavior the same as for existing HDF5 weight saving.
  - All dependencies will be saved for subclassed Models, not just layers. This will make it more useful for training checkpoints (you can assign an optimizer to an attribute and save the slot variables that way).
  - Subclassed models won't support loading by flattened weight list from the TensorFlow format. Since there's no global naming for Layers (it's local to the Model), I think this is reasonable.

PiperOrigin-RevId: 193965120
---
 .../keras/_impl/keras/engine/base_layer.py    |   9 +
 .../keras/_impl/keras/engine/network.py       | 208 +++++++++++++---
 .../keras/_impl/keras/engine/saving_test.py   | 227 +++++++++++++++++-
 .../keras/_impl/keras/engine/training.py      |   3 +
 .../_impl/keras/model_subclassing_test.py     |  29 ++-
 .../python/training/checkpointable_utils.py   |  12 +-
 .../api/golden/tensorflow.keras.-model.pbtxt  |   2 +-
 .../golden/tensorflow.keras.-sequential.pbtxt |   2 +-
 .../tensorflow.keras.models.-model.pbtxt      |   2 +-
 .../tensorflow.keras.models.-sequential.pbtxt |   2 +-
 tensorflow/tools/ci_build/ci_sanity.sh        |   1 +
 11 files changed, 438 insertions(+), 59 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/base_layer.py b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
index 6c68d25127..abae6c3785 100644
--- a/tensorflow/python/keras/_impl/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
@@ -726,8 +726,17 @@ class Layer(checkpointable.CheckpointableBase):
     if hasattr(self, '_initial_weights') and self._initial_weights is not None:
       self.set_weights(self._initial_weights)
       del self._initial_weights
+    self._post_build_cleanup()
     return outputs
 
+  def _post_build_cleanup(self):
+    """Hooks to run after all sub-Layers are built."""
+    # Note that in addition to Layer.__call__, this method is called by Model
+    # after building a graph network (which skips __call__). It should be called
+    # when possible if self.built may have switched from False to True, and is
+    # idempotent.
+    pass  # No-op for Layers which don't override this method.
+
   def apply(self, inputs, *args, **kwargs):
     """Apply the layer on a input.
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py
index 3b419dff3a..4127c781eb 100644
--- a/tensorflow/python/keras/_impl/keras/engine/network.py
+++ b/tensorflow/python/keras/_impl/keras/engine/network.py
@@ -22,11 +22,14 @@ from __future__ import print_function
 import copy
 import json
 import os
+import weakref
 
 import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
@@ -37,6 +40,7 @@ from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_wi
 from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
@@ -114,6 +118,13 @@ class Network(base_layer.Layer):
     self._outbound_nodes = []
     self._inbound_nodes = []
 
+    self._checkpointable_saver = checkpointable_utils.CheckpointableSaver(
+        weakref.ref(self))
+    # A zero-argument function which should be called and set back to None as
+    # soon as the network is built (only applicable to subclassed Models). Runs
+    # restore operations when graph building.
+    self._in_progress_restore_finalizer = None
+
   def _init_graph_network(self, inputs, outputs, name=None):
     self._uses_inputs_arg = True
     # Normalize and set self.inputs, self.outputs.
@@ -1125,62 +1136,179 @@ class Network(base_layer.Layer):
     from tensorflow.python.keras._impl.keras.models import save_model  # pylint: disable=g-import-not-at-top
     save_model(self, filepath, overwrite, include_optimizer)
 
-  def save_weights(self, filepath, overwrite=True):
-    """Dumps all layer weights to a HDF5 file.
+  def save_weights(self, filepath, overwrite=True, save_format=None):
+    """Saves all layer weights.
+
+    Either saves in HDF5 or in TensorFlow format based on the `save_format`
+    argument.
+
+    When saving in HDF5 format, the weight file has:
+      - `layer_names` (attribute), a list of strings
+          (ordered names of model layers).
+      - For every layer, a `group` named `layer.name`
+          - For every such layer group, a group attribute `weight_names`,
+              a list of strings
+              (ordered names of weights tensor of the layer).
+          - For every weight in the layer, a dataset
+              storing the weight value, named after the weight tensor.
 
-    The weight file has:
-        - `layer_names` (attribute), a list of strings
-            (ordered names of model layers).
-        - For every layer, a `group` named `layer.name`
-            - For every such layer group, a group attribute `weight_names`,
-                a list of strings
-                (ordered names of weights tensor of the layer).
-            - For every weight in the layer, a dataset
-                storing the weight value, named after the weight tensor.
+    Currently the TensorFlow format is only supported for user-defined classes
+    inheriting from `tf.keras.Model`, and not for networks constructed from
+    inputs and outputs (using `tf.keras.Model(inputs, outputs)`).
+
+    When saving in TensorFlow format, all objects referenced by the network are
+    saved in the same format as `tf.train.Checkpoint`, including any `Layer`s or
+    `Optimizer`s assigned to attributes in the constructor. See
+    `tf.train.Checkpoint`'s documentation for details.
 
     Arguments:
-        filepath: String, path to the file to save the weights to.
+        filepath: String, path to the file to save the weights to. When saving
+            in TensorFlow format, this is the prefix used for checkpoint files
+            (multiple files are generated). Note that the '.h5' suffix causes
+            weights to be saved in HDF5 format.
         overwrite: Whether to silently overwrite any existing file at the
             target location, or provide the user with a manual prompt.
+        save_format: Either 'tf' or 'h5'. If `None`, defaults to 'tf' for
+            user-defined classes inheriting from `tf.keras.Model` and 'h5' for
+            networks constructed from inputs and outputs. `filepath`s ending in
+            '.h5' or '.keras' always default to HDF5. Currently only 'h5' is
+            supported for networks constructed from inputs and outputs. Once
+            supported, the default for all networks will switch to 'tf'.
 
     Raises:
-        ImportError: If h5py is not available.
+        ImportError: If h5py is not available when attempting to save in HDF5
+            format.
+        ValueError: For invalid/unknown format arguments.
     """
-    if h5py is None:
-      raise ImportError('`save_weights` requires h5py.')
+    filepath_is_h5 = filepath.endswith('.h5') or filepath.endswith('.keras')
+    if save_format is None:
+      if filepath_is_h5:
+        save_format = 'h5'
+      else:
+        if self._is_graph_network:
+          # TODO(allenl): Handle loading by weight index and fix dependencies,
+          # then enable 'tensorflow' format by default for graph networks.
+          save_format = 'h5'
+        else:
+          # Subclassed models save in TensorFlow format by default.
+          save_format = 'tf'
+    else:
+      user_format = save_format.lower().strip()
+      if user_format in ('tensorflow', 'tf'):
+        save_format = 'tf'
+      elif user_format in ('hdf5', 'h5', 'keras'):
+        save_format = 'h5'
+      else:
+        raise ValueError(
+            'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % (
+                save_format,))
+    if save_format == 'tf' and filepath_is_h5:
+      raise ValueError(
+          ('save_weights got save_format="tf"/"tensorflow", but the '
+           'filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras" '
+           'when saving in TensorFlow format.')
+          % filepath)
+
+    if save_format == 'h5' and h5py is None:
+      raise ImportError(
+          '`save_weights` requires h5py when saving in hdf5.')
+    if save_format == 'tf':
+      if self._is_graph_network:
+        raise NotImplementedError(
+            'Networks constructed from inputs and outputs do not yet support '
+            'saving weights in the TensorFlow ("tf") save_format.')
+      check_filepath = filepath + '.index'
+    else:
+      check_filepath = filepath
     # If file exists and should not be overwritten:
-    if not overwrite and os.path.isfile(filepath):
-      proceed = ask_to_proceed_with_overwrite(filepath)
+    if not overwrite and os.path.isfile(check_filepath):
+      proceed = ask_to_proceed_with_overwrite(check_filepath)
       if not proceed:
         return
-    with h5py.File(filepath, 'w') as f:
-      saving.save_weights_to_hdf5_group(f, self.layers)
+    if save_format == 'h5':
+      with h5py.File(filepath, 'w') as f:
+        saving.save_weights_to_hdf5_group(f, self.layers)
+    else:
+      self._checkpointable_saver.save(filepath)
 
   def load_weights(self, filepath, by_name=False):
-    """Loads all layer weights from a HDF5 save file.
-
-    If `by_name` is False (default) weights are loaded
-    based on the network's topology, meaning the architecture
-    should be the same as when the weights were saved.
-    Note that layers that don't have weights are not taken
-    into account in the topological ordering, so adding or
-    removing layers is fine as long as they don't have weights.
-
-    If `by_name` is True, weights are loaded into layers
-    only if they share the same name. This is useful
-    for fine-tuning or transfer-learning models where
+    """Loads all layer weights, either from a TensorFlow or an HDF5 weight file.
+
+    If `by_name` is False weights are loaded based on the network's
+    topology. This means the architecture should be the same as when the weights
+    were saved.  Note that layers that don't have weights are not taken into
+    account in the topological ordering, so adding or removing layers is fine as
+    long as they don't have weights.
+
+    If `by_name` is True, weights are loaded into layers only if they share the
+    same name. This is useful for fine-tuning or transfer-learning models where
     some of the layers have changed.
 
+    Only topological loading (`by_name=False`) is supported when loading weights
+    from the TensorFlow format. Note that topological loading differs slightly
+    between TensorFlow and HDF5 formats for user-defined classes inheriting from
+    `tf.keras.Model`: HDF5 loads based on a flattened list of weights, while the
+    TensorFlow format loads based on the object-local names of attributes to
+    which layers are assigned in the `Model`'s constructor.
+
     Arguments:
-        filepath: String, path to the weights file to load.
-        by_name: Boolean, whether to load weights by name
-            or by topological order.
+        filepath: String, path to the weights file to load. For weight files in
+            TensorFlow format, this is the file prefix (the same as was passed
+            to `save_weights`).
+        by_name: Boolean, whether to load weights by name or by topological
+            order. Only topological loading is supported for weight files in
+            TensorFlow format.
+
+    Returns:
+        When loading a weight file in TensorFlow format, returns the same status
+        object as `tf.train.Checkpoint.restore`. When graph building, restore
+        ops are run automatically as soon as the network is built (on first call
+        for user-defined classes inheriting from `Model`, immediately if it is
+        already built).
+
+        When loading weights in HDF5 format, returns `None`.
 
     Raises:
-        ImportError: If h5py is not available.
+        ImportError: If h5py is not available and the weight file is in HDF5
+            format.
     """
+    if self._is_graph_network:
+      # Graph networks do not currently support TensorFlow formatted weight
+      # files.
+      save_format = 'h5'
+    else:
+      save_format = None
+    if save_format is None:
+      try:
+        pywrap_tensorflow.NewCheckpointReader(filepath)
+        save_format = 'tf'
+      except errors_impl.DataLossError:
+        # The checkpoint is not readable in TensorFlow format. Try HDF5.
+        save_format = 'h5'
+    if save_format == 'tf':
+      if by_name:
+        raise NotImplementedError(
+            'Weights may only be loaded based on topology into Models when '
+            'loading TensorFlow-formatted weights (got by_name=True to '
+            'load_weights).')
+      status = self._checkpointable_saver.restore(filepath)
+      if not context.executing_eagerly():
+        finalizer = status.run_restore_ops
+        if self.built:
+          finalizer()
+        else:
+          # Hold on to this status object until the network is built (for
+          # subclassed Models). Then we'll run restore ops if necessary.
+          self._in_progress_restore_finalizer = finalizer
+      return status
     if h5py is None:
-      raise ImportError('`load_weights` requires h5py.')
+      raise ImportError(
+          '`load_weights` requires h5py when loading weights from HDF5.')
+    if not self._is_graph_network and not self.built:
+      raise NotImplementedError(
+          'Unable to load weights saved in HDF5 format into a subclassed '
+          'Model which has not created its variables yet. Call the Model '
+          'first, then load the weights.')
     with h5py.File(filepath, 'r') as f:
       if 'layer_names' not in f.attrs and 'model_weights' in f:
         f = f['model_weights']
@@ -1189,6 +1317,14 @@ class Network(base_layer.Layer):
       else:
         saving.load_weights_from_hdf5_group(f, self.layers)
 
+  def _post_build_cleanup(self):
+    super(Network, self)._post_build_cleanup()
+    if self._in_progress_restore_finalizer is not None:
+      # Runs queued restore operations left over from load_weights when graph
+      # building.
+      self._in_progress_restore_finalizer()
+      self._in_progress_restore_finalizer = None
+
   def _updated_config(self):
     """Util shared between different serialization methods.
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/saving_test.py b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
index 3b1578cddf..8764ae5e9c 100644
--- a/tensorflow/python/keras/_impl/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
@@ -24,7 +24,15 @@ import tempfile
 
 import numpy as np
 
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import training as training_module
 
@@ -55,12 +63,16 @@ class TestWeightSavingAndLoading(test.TestCase):
       with self.assertRaises(ValueError):
         model.set_weights(weights[::-1])
 
-      if h5py is None:
-        return  # Skip rest of test if H5py isn't available.
-
       temp_dir = self.get_temp_dir()
       self.addCleanup(shutil.rmtree, temp_dir)
 
+      no_extension_path = os.path.join(temp_dir, 'test')
+      with self.assertRaises(NotImplementedError):
+        model.save_weights(no_extension_path, save_format='tensorflow')
+
+      if h5py is None:
+        return  # Skip rest of test if H5py isn't available.
+
       h5_path = os.path.join(temp_dir, 'test.h5')
       model.save_weights(h5_path)
       model.load_weights(h5_path)
@@ -71,6 +83,16 @@ class TestWeightSavingAndLoading(test.TestCase):
       y = model.predict(x)
       self.assertAllClose(ref_y, y)
 
+      model.save_weights(no_extension_path)
+      model.load_weights(no_extension_path)
+      y = model.predict(x)
+      self.assertAllClose(ref_y, y)
+
+      model.save_weights(no_extension_path, save_format='hdf5')
+      model.load_weights(no_extension_path)
+      y = model.predict(x)
+      self.assertAllClose(ref_y, y)
+
   def test_weight_preprocessing(self):
     input_dim = 3
     output_dim = 3
@@ -457,5 +479,204 @@ class TestWholeModelSaving(test.TestCase):
       os.remove(fname)
 
 
+class SubclassedModel(training.Model):
+
+  def __init__(self):
+    super(SubclassedModel, self).__init__()
+    self.x_layer = keras.layers.Dense(3)
+    self.b_layer = keras.layers.Dense(1)
+
+  def call(self, a):
+    return self.b_layer(self.x_layer(a))
+
+
+# TODO(allenl): The graph model tests in this TestCase are still saving in
+# hdf5. Get them to save in tensorflow format.
+class TestWeightSavingAndLoadingTFFormat(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_tensorflow_format_overwrite(self):
+    with self.test_session() as session:
+      model = SubclassedModel()
+      temp_dir = self.get_temp_dir()
+      prefix = os.path.join(temp_dir, 'ckpt')
+
+      x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32)
+      executing_eagerly = context.executing_eagerly()
+      model(x)  # pylint: disable=not-callable
+      if not executing_eagerly:
+        session.run([v.initializer for v in model.variables])
+      model.save_weights(prefix, save_format='tensorflow')
+      model.save_weights(prefix, save_format='tensorflow', overwrite=True)
+      with self.assertRaises(EOFError):
+        # Indirectly tests that the user is prompted
+        model.save_weights(prefix, save_format='tensorflow', overwrite=False)
+
+  def test_no_graph_pollution(self):
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph) as session:
+        model = SubclassedModel()
+        temp_dir = self.get_temp_dir()
+        prefix = os.path.join(temp_dir, 'ckpt')
+
+        x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32)
+        model(x)  # pylint: disable=not-callable
+        session.run([v.initializer for v in model.variables])
+        model.save_weights(prefix, save_format='tensorflow')
+        op_count = len(graph.get_operations())
+        model.save_weights(prefix, save_format='tensorflow')
+        self.assertEqual(len(graph.get_operations()), op_count)
+
+        model.load_weights(prefix)
+        op_count = len(graph.get_operations())
+        model.load_weights(prefix)
+        self.assertEqual(len(graph.get_operations()), op_count)
+
+  def _weight_loading_test_template(self, make_model_fn):
+    with self.test_session() as session:
+      model = make_model_fn()
+      temp_dir = self.get_temp_dir()
+      prefix = os.path.join(temp_dir, 'ckpt')
+
+      x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32)
+      executing_eagerly = context.executing_eagerly()
+      ref_y_tensor = model(x)
+      if not executing_eagerly:
+        session.run([v.initializer for v in model.variables])
+      ref_y = self.evaluate(ref_y_tensor)
+      model.save_weights(prefix)
+      for v in model.variables:
+        self.evaluate(
+            v.assign(random_ops.random_normal(shape=array_ops.shape(v))))
+
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      model.load_weights(prefix)
+      y = self.evaluate(model(x))
+      self.assertAllClose(ref_y, y)
+
+      # Test restore-on-create if this is a subclassed Model (graph Networks
+      # will have already created their variables).
+      load_model = make_model_fn()
+      load_model.load_weights(prefix)
+      restore_on_create_y_tensor = load_model(x)
+      restore_on_create_y = self.evaluate(restore_on_create_y_tensor)
+      self.assertAllClose(ref_y, restore_on_create_y)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_weight_loading_graph_model(self):
+    def _make_graph_model():
+      a = keras.layers.Input(shape=(2,))
+      x = keras.layers.Dense(3)(a)
+      b = keras.layers.Dense(1)(x)
+      return keras.models.Model(a, b)
+
+    if h5py is None:
+      self.skipTest('This test only works with h5py.')
+
+    self._weight_loading_test_template(_make_graph_model)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_weight_loading_subclassed_model(self):
+    self._weight_loading_test_template(SubclassedModel)
+
+  def _new_layer_weight_loading_test_template(
+      self, first_model_fn, second_model_fn, restore_init_fn, by_name):
+    with self.test_session() as session:
+      model = first_model_fn()
+      temp_dir = self.get_temp_dir()
+      prefix = os.path.join(temp_dir, 'ckpt')
+
+      x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32)
+      executing_eagerly = context.executing_eagerly()
+      ref_y_tensor = model(x)
+      if not executing_eagerly:
+        session.run([v.initializer for v in model.variables])
+      ref_y = self.evaluate(ref_y_tensor)
+      model.save_weights(prefix)
+      for v in model.variables:
+        self.evaluate(
+            v.assign(random_ops.random_normal(shape=array_ops.shape(v))))
+
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      second_model = second_model_fn()
+      second_model.load_weights(prefix, by_name=by_name)
+      second_model(x)
+      self.evaluate(restore_init_fn(second_model))
+      second_model.save_weights(prefix)
+      # Check that the second model's checkpoint loads into the original model
+      model.load_weights(prefix, by_name=by_name)
+      y = self.evaluate(model(x))
+      self.assertAllClose(ref_y, y)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_weight_loading_graph_model_added_layer(self):
+    def _save_graph_model():
+      a = keras.layers.Input(shape=(2,))
+      x = keras.layers.Dense(3, name='first')(a)
+      b = keras.layers.Dense(1, name='second')(x)
+      return keras.models.Model(a, b)
+    def _restore_graph_model():
+      a = keras.layers.Input(shape=(2,))
+      x = keras.layers.Dense(3, name='first')(a)
+      y = keras.layers.Dense(1, name='second')(x)
+      b = keras.layers.Dense(3, name='secondjr')(y)
+      return keras.models.Model(a, b)
+    def _restore_init_fn(restore_model):
+      return [v.initializer for v in restore_model.layers[-1].variables]
+
+    if h5py is None:
+      self.skipTest('This test only works with h5py.')
+
+    self._new_layer_weight_loading_test_template(
+        _save_graph_model, _restore_graph_model,
+        _restore_init_fn, by_name=True)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_weight_loading_graph_model_added_no_weight_layer(self):
+    def _save_graph_model():
+      a = keras.layers.Input(shape=(2,))
+      x = keras.layers.Dense(3, name='first')(a)
+      b = keras.layers.Dense(1, name='second')(x)
+      return keras.models.Model(a, b)
+    def _restore_graph_model():
+      a = keras.layers.Input(shape=(2,))
+      x = keras.layers.Dense(3, name='first')(a)
+      y = keras.layers.Dropout(rate=0.1)(x)
+      b = keras.layers.Dense(1, name='second')(y)
+      return keras.models.Model(a, b)
+    def _restore_init_fn(restore_model):
+      del restore_model  # unused
+      return []
+    if h5py is None:
+      self.skipTest('This test only works with h5py.')
+
+    self._new_layer_weight_loading_test_template(
+        _save_graph_model, _restore_graph_model,
+        _restore_init_fn, by_name=False)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_weight_loading_subclassed_model_added_layer(self):
+
+    class SubclassedModelRestore(training.Model):
+
+      def __init__(self):
+        super(SubclassedModelRestore, self).__init__()
+        self.x_layer = keras.layers.Dense(3)
+        self.y_layer = keras.layers.Dense(3)
+        self.b_layer = keras.layers.Dense(1)
+
+      def call(self, a):
+        return self.b_layer(self.y_layer(self.x_layer(a)))
+
+    def _restore_init_fn(restore_model):
+      return [v.initializer for v in restore_model.y_layer.variables]
+
+    self._new_layer_weight_loading_test_template(
+        SubclassedModel, SubclassedModelRestore,
+        _restore_init_fn, by_name=False)
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 146e8fdac9..5f9b3e8c7d 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -584,6 +584,7 @@ class Model(Network):
             updates=updates,
             name='train_function',
             **self._function_kwargs)
+    self._post_build_cleanup()
 
   def _make_test_function(self):
     if not hasattr(self, 'test_function'):
@@ -601,6 +602,7 @@ class Model(Network):
           updates=self.state_updates + self.metrics_updates,
           name='test_function',
           **self._function_kwargs)
+    self._post_build_cleanup()
 
   def _make_predict_function(self):
     if not hasattr(self, 'predict_function'):
@@ -619,6 +621,7 @@ class Model(Network):
           updates=self.state_updates,
           name='predict_function',
           **kwargs)
+    self._post_build_cleanup()
 
   def _standardize_user_data(self,
                              x,
diff --git a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
index bc8698f235..295ad47f6b 100644
--- a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import tempfile
 
 import numpy as np
 import six
@@ -420,8 +419,6 @@ class ModelSubclassingTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def test_saving(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
 
     num_classes = (2, 3)
     num_samples = 100
@@ -437,20 +434,30 @@ class ModelSubclassingTest(test.TestCase):
     model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
     y_ref_1, y_ref_2 = model.predict([x1, x2])
 
-    fd, fname = tempfile.mkstemp('.h5')
-    model.save_weights(fname)
+    tf_format_name = os.path.join(self.get_temp_dir(), 'ckpt')
+    model.save_weights(tf_format_name)
+    if h5py is not None:
+      hdf5_format_name = os.path.join(self.get_temp_dir(), 'weights.h5')
+      model.save_weights(hdf5_format_name)
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-    # need to build the model before loading weights
-    # (otherwise no weights to load)
-    model._set_inputs([x1, x2])
-    model.load_weights(fname)
+
+    if h5py is not None:
+      with self.assertRaises(ValueError):
+        model.load_weights(hdf5_format_name)
+
+    model.load_weights(tf_format_name)
 
     y1, y2 = model.predict([x1, x2])
     self.assertAllClose(y_ref_1, y1, atol=1e-5)
     self.assertAllClose(y_ref_2, y2, atol=1e-5)
-    os.close(fd)
-    os.remove(fname)
+
+    if h5py is not None:
+      model.load_weights(hdf5_format_name)
+
+      y1, y2 = model.predict([x1, x2])
+      self.assertAllClose(y_ref_1, y1, atol=1e-5)
+      self.assertAllClose(y_ref_2, y2, atol=1e-5)
 
   @test_util.run_in_graph_and_eager_modes()
   def test_summary(self):
diff --git a/tensorflow/python/training/checkpointable_utils.py b/tensorflow/python/training/checkpointable_utils.py
index 4769e15120..13bd89d907 100644
--- a/tensorflow/python/training/checkpointable_utils.py
+++ b/tensorflow/python/training/checkpointable_utils.py
@@ -616,11 +616,10 @@ class CheckpointableSaver(object):
     # Allow passing in a weak reference to avoid reference cycles when
     # `Checkpointable` objects save themselves.
     self._root_checkpointable_ref = root_checkpointable
-    if not context.executing_eagerly():
-      with ops.device("/cpu:0"):
-        self._file_prefix_placeholder = constant_op.constant("model")
-    else:
-      self._file_prefix_placeholder = None
+    # The file prefix placeholder is created lazily when graph building (and not
+    # at all when executing eagerly) to avoid creating ops in the constructor
+    # (when they may never be necessary).
+    self._file_prefix_placeholder = None
 
     # Op caching for save
     self._object_graph_feed_tensor = None
@@ -778,6 +777,9 @@ class CheckpointableSaver(object):
       return InitializationOnlyStatus(self._root_checkpointable)
     in_graph_mode = not context.executing_eagerly()
     if in_graph_mode:
+      if self._file_prefix_placeholder is None:
+        with ops.device("/cpu:0"):
+          self._file_prefix_placeholder = constant_op.constant("model")
       file_prefix_tensor = self._file_prefix_placeholder
       file_prefix_feed_dict = {self._file_prefix_placeholder: save_path}
     else:
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index cdf2da712f..cee76bdc1d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -239,7 +239,7 @@ tf_class {
   }
   member_method {
     name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], "
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "set_weights"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index 5c2c29e60f..02718cb5f9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -256,7 +256,7 @@ tf_class {
   }
   member_method {
     name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], "
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "set_weights"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index b3f3f16922..dd78384005 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -239,7 +239,7 @@ tf_class {
   }
   member_method {
     name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], "
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "set_weights"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index 4ac6811bac..9fcb03f47e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -256,7 +256,7 @@ tf_class {
   }
   member_method {
     name: "save_weights"
-    argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], "
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "set_weights"
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 9627475d84..8e8b2191e5 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -101,6 +101,7 @@ do_pylint() {
 "^tensorflow/contrib/eager/python/metrics_impl\.py.*\[E0202.*method-hidden "\
 "^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator "\
 "^tensorflow/python/keras/_impl/keras/callbacks\.py.*\[E1133.*not-an-iterable "\
+"^tensorflow/python/keras/_impl/keras/engine/base_layer.py.*\[E0203.*access-member-before-definition "\
 "^tensorflow/python/keras/_impl/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition "\
 "^tensorflow/python/kernel_tests/constant_op_eager_test.py.*\[E0303.*invalid-length-returned"
 
-- 
GitLab


From 06d5ca2ae097c08c886759dd27f90b19e4c6f49d Mon Sep 17 00:00:00 2001
From: Andy Kernahan <andrew.kernahan@gmail.com>
Date: Mon, 23 Apr 2018 20:32:35 +0100
Subject: [PATCH 311/450] Fix tfcompile module label. (#16582)

---
 tensorflow/docs_src/performance/xla/tfcompile.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/docs_src/performance/xla/tfcompile.md b/tensorflow/docs_src/performance/xla/tfcompile.md
index f57ca3948d..8521d7eacb 100644
--- a/tensorflow/docs_src/performance/xla/tfcompile.md
+++ b/tensorflow/docs_src/performance/xla/tfcompile.md
@@ -86,7 +86,7 @@ code. `tf_library` utilizes `tfcompile` to compile the TensorFlow graph into
 executable code.
 
 ```build
-load("//third_party/tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
 
 # Use the tf_library macro to compile your graph into executable code.
 tf_library(
@@ -258,8 +258,8 @@ file.
 
 ```build
 # Example of linking your binary
-# Also see //third_party/tensorflow/compiler/aot/tests/BUILD
-load("//third_party/tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+# Also see //tensorflow/compiler/aot/tests/BUILD
+load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
 
 # The same tf_library call from step 2 above.
 tf_library(
-- 
GitLab


From d9191b881fc283d93a8eaa4961c5e16f2205311f Mon Sep 17 00:00:00 2001
From: Martin Wicke <wicke@google.com>
Date: Mon, 23 Apr 2018 12:35:35 -0700
Subject: [PATCH 312/450] Re-enable metrics_test, increase sharding.

PiperOrigin-RevId: 193967074
---
 tensorflow/python/kernel_tests/BUILD | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 8628ca5d40..ebbec39cf3 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2877,11 +2877,8 @@ tf_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:variables",
     ],
-    shard_count = 10,
-    tags = [
-        "no_windows_gpu",
-        "noasan",
-    ],
+    shard_count = 20,
+    tags = ["no_windows_gpu"],
 )
 
 tf_py_test(
-- 
GitLab


From 594c1c60f523ba4dd45545876e850ca7281be73a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 13:12:58 -0700
Subject: [PATCH 313/450] Entropy bottleneck class.

PiperOrigin-RevId: 193972549
---
 tensorflow/contrib/BUILD                      |   2 +-
 tensorflow/contrib/cmake/python_modules.txt   |   1 +
 .../contrib/cmake/tf_core_kernels.cmake       |   1 +
 tensorflow/contrib/coder/BUILD                |  56 +-
 tensorflow/contrib/coder/__init__.py          |   3 +-
 .../coder/python/layers/entropybottleneck.py  | 697 ++++++++++++++++++
 .../python/layers/entropybottleneck_test.py   | 315 ++++++++
 7 files changed, 1071 insertions(+), 4 deletions(-)
 create mode 100644 tensorflow/contrib/coder/python/layers/entropybottleneck.py
 create mode 100644 tensorflow/contrib/coder/python/layers/entropybottleneck_test.py

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index d28392a62c..8edb8654b8 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -29,7 +29,7 @@ py_library(
         "//tensorflow/contrib/cloud:cloud_py",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
-        "//tensorflow/contrib/coder:coder_ops_py",
+        "//tensorflow/contrib/coder:coder_py",
         "//tensorflow/contrib/compiler:compiler_py",
         "//tensorflow/contrib/copy_graph:copy_graph_py",
         "//tensorflow/contrib/crf:crf_py",
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index fbcdf7e753..932a6eeeaa 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -144,6 +144,7 @@ tensorflow/contrib/coder
 tensorflow/contrib/coder/kernels
 tensorflow/contrib/coder/ops
 tensorflow/contrib/coder/python
+tensorflow/contrib/coder/python/layers
 tensorflow/contrib/coder/python/ops
 tensorflow/contrib/compiler
 tensorflow/contrib/copy_graph
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index ed018b4fed..376496b33f 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -63,6 +63,7 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/training_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc"
diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD
index 9ca4ce8a9c..a146460a9c 100644
--- a/tensorflow/contrib/coder/BUILD
+++ b/tensorflow/contrib/coder/BUILD
@@ -1,5 +1,5 @@
 # Description:
-#   Contains entropy coding related modules.
+#   Contains tools related to data compression.
 
 package(default_visibility = [
     "//learning/brain:__subpackages__",
@@ -152,10 +152,21 @@ tf_gen_op_wrapper_py(
     deps = [":coder_ops_op_lib"],
 )
 
+py_library(
+    name = "coder_py",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":coder_ops_py",
+        ":entropybottleneck_py",
+    ],
+)
+
 tf_custom_op_py_library(
     name = "coder_ops_py",
     srcs = [
-        "__init__.py",
         "python/ops/coder_ops.py",
     ],
     dso = [
@@ -186,3 +197,44 @@ tf_py_test(
     ],
     main = "python/ops/coder_ops_test.py",
 )
+
+py_library(
+    name = "entropybottleneck_py",
+    srcs = [
+        "python/layers/entropybottleneck.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":coder_ops_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/keras:engine",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_py_test(
+    name = "entropybottleneck_py_test",
+    srcs = [
+        "python/layers/entropybottleneck_test.py",
+    ],
+    additional_deps = [
+        ":entropybottleneck_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:variables",
+        "//tensorflow/python:training",
+    ],
+    main = "python/layers/entropybottleneck_test.py",
+)
diff --git a/tensorflow/contrib/coder/__init__.py b/tensorflow/contrib/coder/__init__.py
index b7e663e6f1..99b8ac7595 100644
--- a/tensorflow/contrib/coder/__init__.py
+++ b/tensorflow/contrib/coder/__init__.py
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Entropy code operations."""
+"""Data compression tools."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.coder.python.layers.entropybottleneck import *
 from tensorflow.contrib.coder.python.ops.coder_ops import *
 # pylint: enable=wildcard-import
 
diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck.py b/tensorflow/contrib/coder/python/layers/entropybottleneck.py
new file mode 100644
index 0000000000..f039cb0f52
--- /dev/null
+++ b/tensorflow/contrib/coder/python/layers/entropybottleneck.py
@@ -0,0 +1,697 @@
+# -*- coding: utf-8 -*-
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Entropy bottleneck layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.coder.python.ops import coder_ops
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import engine
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+
+
+class EntropyBottleneck(engine.Layer):
+  """Entropy bottleneck layer.
+
+  This layer can be used to model the entropy (the amount of information
+  conveyed) of the tensor passing through it. During training, this can be used
+  to impose a (soft) entropy constraint on its activations, limiting the amount
+  of information flowing through the layer. Note that this is distinct from
+  other types of bottlenecks, which reduce the dimensionality of the space, for
+  example. Dimensionality reduction does not limit the amount of information,
+  and does not enable efficient data compression per se.
+
+  After training, this layer can be used to compress any input tensor to a
+  string, which may be written to a file, and to decompress a file which it
+  previously generated back to a reconstructed tensor (possibly on a different
+  machine having access to the same model checkpoint). The entropies estimated
+  during training or evaluation are approximately equal to the average length of
+  the strings in bits.
+
+  The layer implements a flexible probability density model to estimate entropy,
+  which is described in the appendix of the paper (please cite the paper if you
+  use this code for scientific work):
+
+  "Variational image compression with a scale hyperprior"
+
+  Johannes Ballé, David Minnen, Saurabh Singh, Sung Jin Hwang, Nick Johnston
+
+  https://arxiv.org/abs/1802.01436
+
+  The layer assumes that the input tensor is at least 2D, with a batch dimension
+  at the beginning and a channel dimension as specified by `data_format`. The
+  layer trains an independent probability density model for each channel, but
+  assumes that across all other dimensions, the inputs are i.i.d. (independent
+  and identically distributed). Because the entropy (and hence, average
+  codelength) is a function of the densities, this assumption may have a direct
+  effect on the compression performance.
+
+  Because data compression always involves discretization, the outputs of the
+  layer are generally only approximations of its inputs. During training,
+  discretization is modeled using additive uniform noise to ensure
+  differentiability. The entropies computed during training are differential
+  entropies. During evaluation, the data is actually quantized, and the
+  entropies are discrete (Shannon entropies). To make sure the approximated
+  tensor values are good enough for practical purposes, the training phase must
+  be used to balance the quality of the approximation with the entropy, by
+  adding an entropy term to the training loss, as in the following example.
+
+  Here, we use the entropy bottleneck to compress the latent representation of
+  an autoencoder. The data vectors `x` in this case are 4D tensors in
+  `'channels_last'` format (for example, 16x16 pixel grayscale images).
+
+  The layer always produces exactly one auxiliary loss and one update op which
+  are only significant for compression and decompression. To use the compression
+  feature, the auxiliary loss must be minimized during or after training. After
+  that, the update op must be executed at least once. Here, we simply attach
+  them to the main training step.
+
+  Training:
+  ```
+  # Build autoencoder.
+  x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1])
+  y = forward_transform(x)
+  entropy_bottleneck = EntropyBottleneck()
+  y_, likelihoods = entropy_bottleneck(y, training=True)
+  x_ = backward_transform(y_)
+
+  # Information content (= predicted codelength) in bits of each batch element
+  # (note that taking the natural logarithm and dividing by `log(2)` is
+  # equivalent to taking base-2 logarithms):
+  bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2)
+
+  # Squared difference of each batch element:
+  squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3))
+
+  # The loss is a weighted sum of mean squared error and entropy (average
+  # information content), where the weight controls the trade-off between
+  # approximation error and entropy.
+  main_loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits)
+
+  # Minimize loss and auxiliary loss, and execute update op.
+  main_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
+  main_step = main_optimizer.minimize(main_loss)
+  # 1e-2 is a good starting point for the learning rate of the auxiliary loss,
+  # assuming Adam is used.
+  aux_optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
+  aux_step = aux_optimizer.minimize(entropy_bottleneck.losses[0])
+  step = tf.group(main_step, aux_step, entropy_bottleneck.updates[0])
+  ```
+
+  Evaluation:
+  ```
+  # Build autoencoder.
+  x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1])
+  y = forward_transform(x)
+  y_, likelihoods = EntropyBottleneck()(y, training=False)
+  x_ = backward_transform(y_)
+
+  # Information content (= predicted codelength) in bits of each batch element:
+  bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2)
+
+  # Squared difference of each batch element:
+  squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3))
+
+  # The loss is a weighted sum of mean squared error and entropy (average
+  # information content), where the weight controls the trade-off between
+  # approximation error and entropy.
+  loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits)
+  ```
+
+  To be able to compress the bottleneck tensor and decompress it in a different
+  session, or on a different machine, you need three items:
+  - The compressed representations stored as strings.
+  - The shape of the bottleneck for these string representations as a `Tensor`,
+    as well as the number of channels of the bottleneck at graph construction
+    time.
+  - The checkpoint of the trained model that was used for compression. Note:
+    It is crucial that the auxiliary loss produced by this layer is minimized
+    during or after training, and that the update op is run after training and
+    minimization of the auxiliary loss, but *before* the checkpoint is saved.
+
+  Compression:
+  ```
+  x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1])
+  y = forward_transform(x)
+  strings = EntropyBottleneck().compress(y)
+  shape = tf.shape(y)[1:]
+  ```
+
+  Decompression:
+  ```
+  strings = tf.placeholder(tf.string, shape=[None])
+  shape = tf.placeholder(tf.int32, shape=[3])
+  entropy_bottleneck = EntropyBottleneck(dtype=tf.float32)
+  y_ = entropy_bottleneck.decompress(strings, shape, channels=5)
+  x_ = backward_transform(y_)
+  ```
+  Here, we assumed that the tensor produced by the forward transform has 5
+  channels.
+
+  The above four use cases can also be implemented within the same session (i.e.
+  on the same `EntropyBottleneck` instance), for testing purposes, etc., by
+  calling the object more than once.
+
+  Arguments:
+    init_scale: Float. A scaling factor determining the initial width of the
+      probability densities. This should be chosen big enough so that the
+      range of values of the layer inputs roughly falls within the interval
+      [`-init_scale`, `init_scale`] at the beginning of training.
+    filters: An iterable of ints, giving the number of filters at each layer of
+      the density model. Generally, the more filters and layers, the more
+      expressive is the density model in terms of modeling more complicated
+      distributions of the layer inputs. For details, refer to the paper
+      referenced above. The default is `[3, 3, 3]`, which should be sufficient
+      for most practical purposes.
+    tail_mass: Float, between 0 and 1. The bottleneck layer automatically
+      determines the range of input values that should be represented based on
+      their frequency of occurrence. Values occurring in the tails of the
+      distributions will be clipped to that range during compression.
+      `tail_mass` determines the amount of probability mass in the tails which
+      is cut off in the worst case. For example, the default value of `1e-9`
+      means that at most 1 in a billion input samples will be clipped to the
+      range.
+    optimize_integer_offset: Boolean. Typically, the input values of this layer
+      are floats, which means that quantization during evaluation can be
+      performed with an arbitrary offset. By default, the layer determines that
+      offset automatically. In special situations, such as when it is known that
+      the layer will receive only full integer values during evaluation, it can
+      be desirable to set this argument to `False` instead, in order to always
+      quantize to full integer values.
+    likelihood_bound: Float. If positive, the returned likelihood values are
+      ensured to be greater than or equal to this value. This prevents very
+      large gradients with a typical entropy loss (defaults to 1e-9).
+    range_coder_precision: Integer, between 1 and 16. The precision of the range
+      coder used for compression and decompression. This trades off computation
+      speed with compression efficiency, where 16 is the slowest but most
+      efficient setting. Choosing lower values may increase the average
+      codelength slightly compared to the estimated entropies.
+    data_format: Either `'channels_first'` or `'channels_last'` (default).
+    trainable: Boolean. Whether the layer should be trained.
+    name: String. The name of the layer.
+    dtype: Default dtype of the layer's parameters (default of `None` means use
+      the type of the first input).
+
+  Read-only properties:
+    init_scale: See above.
+    filters: See above.
+    tail_mass: See above.
+    optimize_integer_offset: See above.
+    likelihood_bound: See above.
+    range_coder_precision: See above.
+    data_format: See above.
+    name: String. See above.
+    dtype: See above.
+    trainable_variables: List of trainable variables.
+    non_trainable_variables: List of non-trainable variables.
+    variables: List of all variables of this layer, trainable and non-trainable.
+    updates: List of update ops of this layer. Always contains exactly one
+      update op, which must be run once after the last training step, before
+      `compress` or `decompress` is used.
+    losses: List of losses added by this layer. Always contains exactly one
+      auxiliary loss, which must be added to the training loss.
+
+  Mutable properties:
+    trainable: Boolean. Whether the layer should be trained.
+    input_spec: Optional `InputSpec` object specifying the constraints on inputs
+      that can be accepted by the layer.
+  """
+
+  def __init__(self, init_scale=10, filters=(3, 3, 3), tail_mass=1e-9,
+               optimize_integer_offset=True, likelihood_bound=1e-9,
+               range_coder_precision=16, data_format="channels_last", **kwargs):
+    super(EntropyBottleneck, self).__init__(**kwargs)
+    self._init_scale = float(init_scale)
+    self._filters = tuple(int(f) for f in filters)
+    self._tail_mass = float(tail_mass)
+    if not 0 < self.tail_mass < 1:
+      raise ValueError(
+          "`tail_mass` must be between 0 and 1, got {}.".format(self.tail_mass))
+    self._optimize_integer_offset = bool(optimize_integer_offset)
+    self._likelihood_bound = float(likelihood_bound)
+    self._range_coder_precision = int(range_coder_precision)
+    self._data_format = data_format
+    self._channel_axis(2)  # trigger ValueError early
+    self.input_spec = engine.InputSpec(min_ndim=2)
+
+  @property
+  def init_scale(self):
+    return self._init_scale
+
+  @property
+  def filters(self):
+    return self._filters
+
+  @property
+  def tail_mass(self):
+    return self._tail_mass
+
+  @property
+  def optimize_integer_offset(self):
+    return self._optimize_integer_offset
+
+  @property
+  def likelihood_bound(self):
+    return self._likelihood_bound
+
+  @property
+  def range_coder_precision(self):
+    return self._range_coder_precision
+
+  @property
+  def data_format(self):
+    return self._data_format
+
+  def _channel_axis(self, ndim):
+    try:
+      return {"channels_first": 1, "channels_last": ndim - 1}[self.data_format]
+    except KeyError:
+      raise ValueError("Unsupported `data_format` for {} layer: {}.".format(
+          self.__class__.__name__, self.data_format))
+
+  def _logits_cumulative(self, inputs, stop_gradient):
+    """Evaluate logits of the cumulative densities.
+
+    Args:
+      inputs: The values at which to evaluate the cumulative densities, expected
+        to be a `Tensor` of shape `(channels, 1, batch)`.
+      stop_gradient: Boolean. Whether to add `array_ops.stop_gradient` calls so
+        that the gradient of the output with respect to the density model
+        parameters is disconnected (the gradient with respect to `inputs` is
+        left untouched).
+
+    Returns:
+      A `Tensor` of the same shape as `inputs`, containing the logits of the
+      cumulative densities evaluated at the given inputs.
+    """
+    logits = inputs
+
+    for i in range(len(self.filters) + 1):
+      matrix = self._matrices[i]
+      if stop_gradient:
+        matrix = array_ops.stop_gradient(matrix)
+      logits = math_ops.matmul(matrix, logits)
+
+      bias = self._biases[i]
+      if stop_gradient:
+        bias = array_ops.stop_gradient(bias)
+      logits += bias
+
+      if i < len(self._factors):
+        factor = self._factors[i]
+        if stop_gradient:
+          factor = array_ops.stop_gradient(factor)
+        logits += factor * math_ops.tanh(logits)
+
+    return logits
+
+  def build(self, input_shape):
+    """Builds the layer.
+
+    Creates the variables for the network modeling the densities, creates the
+    auxiliary loss estimating the median and tail quantiles of the densities,
+    and then uses that to create the probability mass functions and the update
+    op that produces the discrete cumulative density functions used by the range
+    coder.
+
+    Args:
+      input_shape: Shape of the input tensor, used to get the number of
+        channels.
+
+    Raises:
+      ValueError: if `input_shape` doesn't specify the length of the channel
+        dimension.
+    """
+    input_shape = tensor_shape.TensorShape(input_shape)
+    channel_axis = self._channel_axis(input_shape.ndims)
+    channels = input_shape[channel_axis].value
+    if channels is None:
+      raise ValueError("The channel dimension of the inputs must be defined.")
+    self.input_spec = engine.InputSpec(
+        ndim=input_shape.ndims, axes={channel_axis: channels})
+    filters = (1,) + self.filters + (1,)
+    scale = self.init_scale ** (1 / (len(self.filters) + 1))
+
+    # Create variables.
+    self._matrices = []
+    self._biases = []
+    self._factors = []
+    for i in range(len(self.filters) + 1):
+      init = np.log(np.expm1(1 / scale / filters[i + 1]))
+      matrix = self.add_variable(
+          "matrix_{}".format(i), dtype=self.dtype,
+          shape=(channels, filters[i + 1], filters[i]),
+          initializer=init_ops.Constant(init))
+      matrix = nn.softplus(matrix)
+      self._matrices.append(matrix)
+
+      bias = self.add_variable(
+          "bias_{}".format(i), dtype=self.dtype,
+          shape=(channels, filters[i + 1], 1),
+          initializer=init_ops.RandomUniform(-.5, .5))
+      self._biases.append(bias)
+
+      if i < len(self.filters):
+        factor = self.add_variable(
+            "factor_{}".format(i), dtype=self.dtype,
+            shape=(channels, filters[i + 1], 1),
+            initializer=init_ops.Zeros())
+        factor = math_ops.tanh(factor)
+        self._factors.append(factor)
+
+    # To figure out what range of the densities to sample, we need to compute
+    # the quantiles given by `tail_mass / 2` and `1 - tail_mass / 2`. Since we
+    # can't take inverses of the cumulative directly, we make it an optimization
+    # problem:
+    # `quantiles = argmin(|logit(cumulative) - target|)`
+    # where `target` is `logit(tail_mass / 2)` or `logit(1 - tail_mass / 2)`.
+    # Taking the logit (inverse of sigmoid) of the cumulative makes the
+    # representation of the right target more numerically stable.
+
+    # Numerically stable way of computing logits of `tail_mass / 2`
+    # and `1 - tail_mass / 2`.
+    target = np.log(2 / self.tail_mass - 1)
+    # Compute lower and upper tail quantile as well as median.
+    target = constant_op.constant([-target, 0, target], dtype=self.dtype)
+
+    def quantiles_initializer(shape, dtype=None, partition_info=None):
+      del partition_info  # unused
+      assert tuple(shape[1:]) == (1, 3)
+      init = constant_op.constant(
+          [[[-self.init_scale, 0, self.init_scale]]], dtype=dtype)
+      return array_ops.tile(init, (shape[0], 1, 1))
+
+    quantiles = self.add_variable(
+        "quantiles", shape=(channels, 1, 3), dtype=self.dtype,
+        initializer=quantiles_initializer)
+    logits = self._logits_cumulative(quantiles, stop_gradient=True)
+    loss = math_ops.reduce_sum(abs(logits - target))
+    self.add_loss(loss, inputs=None)
+
+    # Save medians for `call`, `compress`, and `decompress`.
+    self._medians = quantiles[:, :, 1:2]
+    if not self.optimize_integer_offset:
+      self._medians = math_ops.round(self._medians)
+
+    # Largest distance observed between lower tail quantile and median,
+    # or between median and upper tail quantile.
+    minima = math_ops.reduce_max(self._medians - quantiles[:, :, 0:1])
+    maxima = math_ops.reduce_max(quantiles[:, :, 2:3] - self._medians)
+    minmax = math_ops.maximum(minima, maxima)
+    minmax = math_ops.ceil(minmax)
+    minmax = math_ops.maximum(minmax, 1)
+
+    # Sample the density up to `minmax` around the median.
+    samples = math_ops.range(-minmax, minmax + 1, dtype=self.dtype)
+    samples += self._medians
+
+    half = constant_op.constant(.5, dtype=self.dtype)
+    # We strip the sigmoid from the end here, so we can use the special rule
+    # below to only compute differences in the left tail of the sigmoid.
+    # This increases numerical stability (see explanation in `call`).
+    lower = self._logits_cumulative(samples - half, stop_gradient=True)
+    upper = self._logits_cumulative(samples + half, stop_gradient=True)
+    # Flip signs if we can move more towards the left tail of the sigmoid.
+    sign = -math_ops.sign(math_ops.add_n([lower, upper]))
+    pmf = abs(math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower))
+    # Add tail masses to first and last bin of pmf, as we clip values for
+    # compression, meaning that out-of-range values get mapped to these bins.
+    pmf = array_ops.concat([
+        math_ops.add_n([pmf[:, 0, :1], math_ops.sigmoid(lower[:, 0, :1])]),
+        pmf[:, 0, 1:-1],
+        math_ops.add_n([pmf[:, 0, -1:], math_ops.sigmoid(-upper[:, 0, -1:])]),
+        ], axis=-1)
+    self._pmf = pmf
+
+    cdf = coder_ops.pmf_to_quantized_cdf(
+        pmf, precision=self.range_coder_precision)
+    def cdf_getter(*args, **kwargs):
+      del args, kwargs  # ignored
+      return variable_scope.get_variable(
+          "quantized_cdf", dtype=dtypes.int32, initializer=cdf,
+          trainable=False, validate_shape=False, collections=())
+    # Need to provide a fake shape here since add_variable insists on it.
+    self._quantized_cdf = self.add_variable(
+        "quantized_cdf", shape=(channels, 1), dtype=dtypes.int32,
+        getter=cdf_getter, trainable=False)
+
+    update_op = state_ops.assign(
+        self._quantized_cdf, cdf, validate_shape=False)
+    self.add_update(update_op, inputs=None)
+
+    super(EntropyBottleneck, self).build(input_shape)
+
+  def call(self, inputs, training):
+    """Pass a tensor through the bottleneck.
+
+    Args:
+      inputs: The tensor to be passed through the bottleneck.
+      training: Boolean. If `True`, returns a differentiable approximation of
+        the inputs, and their likelihoods under the modeled probability
+        densities. If `False`, returns the quantized inputs and their
+        likelihoods under the corresponding probability mass function. These
+        quantities can't be used for training, as they are not differentiable,
+        but represent actual compression more closely.
+
+    Returns:
+      values: `Tensor` with the same shape as `inputs` containing the perturbed
+        or quantized input values.
+      likelihood: `Tensor` with the same shape as `inputs` containing the
+        likelihood of `values` under the modeled probability distributions.
+
+    Raises:
+      ValueError: if `inputs` has different `dtype` or number of channels than
+        a previous set of inputs the model was invoked with earlier.
+    """
+    inputs = ops.convert_to_tensor(inputs)
+    ndim = self.input_spec.ndim
+    channel_axis = self._channel_axis(ndim)
+    half = constant_op.constant(.5, dtype=self.dtype)
+
+    # Convert to (channels, 1, batch) format by commuting channels to front
+    # and then collapsing.
+    order = list(range(ndim))
+    order.pop(channel_axis)
+    order.insert(0, channel_axis)
+    values = array_ops.transpose(inputs, order)
+    shape = array_ops.shape(values)
+    values = array_ops.reshape(values, (shape[0], 1, -1))
+
+    # Add noise or quantize.
+    if training:
+      noise = random_ops.random_uniform(array_ops.shape(values), -half, half)
+      values = math_ops.add_n([values, noise])
+    elif self.optimize_integer_offset:
+      values = math_ops.round(values - self._medians) + self._medians
+    else:
+      values = math_ops.round(values)
+
+    # Evaluate densities.
+    # We can use the special rule below to only compute differences in the left
+    # tail of the sigmoid. This increases numerical stability: sigmoid(x) is 1
+    # for large x, 0 for small x. Subtracting two numbers close to 0 can be done
+    # with much higher precision than subtracting two numbers close to 1.
+    lower = self._logits_cumulative(values - half, stop_gradient=False)
+    upper = self._logits_cumulative(values + half, stop_gradient=False)
+    # Flip signs if we can move more towards the left tail of the sigmoid.
+    sign = -math_ops.sign(math_ops.add_n([lower, upper]))
+    sign = array_ops.stop_gradient(sign)
+    likelihood = abs(
+        math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower))
+    if self.likelihood_bound > 0:
+      likelihood_bound = constant_op.constant(
+          self.likelihood_bound, dtype=self.dtype)
+      # TODO(jballe): Override gradients.
+      likelihood = math_ops.maximum(likelihood, likelihood_bound)
+
+    # Convert back to input tensor shape.
+    order = list(range(1, ndim))
+    order.insert(channel_axis, 0)
+    values = array_ops.reshape(values, shape)
+    values = array_ops.transpose(values, order)
+    likelihood = array_ops.reshape(likelihood, shape)
+    likelihood = array_ops.transpose(likelihood, order)
+
+    if not context.executing_eagerly():
+      values_shape, likelihood_shape = self.compute_output_shape(inputs.shape)
+      values.set_shape(values_shape)
+      likelihood.set_shape(likelihood_shape)
+
+    return values, likelihood
+
+  def compress(self, inputs):
+    """Compress inputs and store their binary representations into strings.
+
+    Args:
+      inputs: `Tensor` with values to be compressed.
+
+    Returns:
+      String `Tensor` vector containing the compressed representation of each
+      batch element of `inputs`.
+    """
+    with ops.name_scope(self._name_scope()):
+      inputs = ops.convert_to_tensor(inputs)
+      if not self.built:
+        # Check input assumptions set before layer building, e.g. input rank.
+        self._assert_input_compatibility(inputs)
+        if self.dtype is None:
+          self._dtype = inputs.dtype.base_dtype.name
+        self.build(inputs.shape)
+
+      # Check input assumptions set after layer building, e.g. input shape.
+      if not context.executing_eagerly():
+        self._assert_input_compatibility(inputs)
+
+      ndim = self.input_spec.ndim
+      channel_axis = self._channel_axis(ndim)
+      # Tuple of slices for expanding dimensions of tensors below.
+      slices = ndim * [None] + [slice(None)]
+      slices[channel_axis] = slice(None)
+      slices = tuple(slices)
+
+      # Expand dimensions of CDF to input dimensions, keeping the channels along
+      # the right dimension.
+      cdf = self._quantized_cdf[slices[1:]]
+      num_levels = array_ops.shape(cdf)[-1] - 1
+
+      # Bring inputs to the right range by centering the range on the medians.
+      half = constant_op.constant(.5, dtype=self.dtype)
+      medians = array_ops.squeeze(self._medians, [1, 2])
+      offsets = (math_ops.cast(num_levels // 2, self.dtype) + half) - medians
+      # Expand offsets to input dimensions and add to inputs.
+      values = inputs + offsets[slices[:-1]]
+
+      # Clip to range and cast to integers. Because we have added .5 above, and
+      # all values are positive, the cast effectively implements rounding.
+      values = math_ops.maximum(values, half)
+      values = math_ops.minimum(
+          values, math_ops.cast(num_levels, self.dtype) - half)
+      values = math_ops.cast(values, dtypes.int16)
+
+      def loop_body(tensor):
+        return coder_ops.range_encode(
+            tensor, cdf, precision=self.range_coder_precision)
+      strings = functional_ops.map_fn(
+          loop_body, values, dtype=dtypes.string, back_prop=False)
+
+      if not context.executing_eagerly():
+        strings.set_shape(inputs.shape[:1])
+
+      return strings
+
+  def decompress(self, strings, shape, channels=None):
+    """Decompress values from their compressed string representations.
+
+    Args:
+      strings: A string `Tensor` vector containing the compressed data.
+      shape: A `Tensor` vector of int32 type. Contains the shape of the tensor
+        to be decompressed, excluding the batch dimension.
+      channels: Integer. Specifies the number of channels statically. Needs only
+        be set if the layer hasn't been built yet (i.e., this is the first input
+        it receives).
+
+    Returns:
+      The decompressed `Tensor`. Its shape will be equal to `shape` prepended
+      with the batch dimension from `strings`.
+
+    Raises:
+      ValueError: If the length of `shape` isn't available at graph construction
+        time.
+    """
+    with ops.name_scope(self._name_scope()):
+      strings = ops.convert_to_tensor(strings)
+      shape = ops.convert_to_tensor(shape)
+      if self.built:
+        ndim = self.input_spec.ndim
+        channel_axis = self._channel_axis(ndim)
+        if channels is None:
+          channels = self.input_spec.axes[channel_axis]
+      else:
+        if not (shape.shape.is_fully_defined() and shape.shape.ndims == 1):
+          raise ValueError("`shape` must be a vector with known length.")
+        ndim = shape.shape[0].value + 1
+        channel_axis = self._channel_axis(ndim)
+        input_shape = ndim * [None]
+        input_shape[channel_axis] = channels
+        self.build(input_shape)
+
+      # Tuple of slices for expanding dimensions of tensors below.
+      slices = ndim * [None] + [slice(None)]
+      slices[channel_axis] = slice(None)
+      slices = tuple(slices)
+
+      # Expand dimensions of CDF to input dimensions, keeping the channels along
+      # the right dimension.
+      cdf = self._quantized_cdf[slices[1:]]
+      num_levels = array_ops.shape(cdf)[-1] - 1
+
+      def loop_body(string):
+        return coder_ops.range_decode(
+            string, shape, cdf, precision=self.range_coder_precision)
+      outputs = functional_ops.map_fn(
+          loop_body, strings, dtype=dtypes.int16, back_prop=False)
+      outputs = math_ops.cast(outputs, self.dtype)
+
+      medians = array_ops.squeeze(self._medians, [1, 2])
+      offsets = math_ops.cast(num_levels // 2, self.dtype) - medians
+      outputs -= offsets[slices[:-1]]
+
+      if not context.executing_eagerly():
+        outputs_shape = ndim * [None]
+        outputs_shape[0] = strings.shape[0]
+        outputs_shape[channel_axis] = channels
+        outputs.set_shape(outputs_shape)
+
+      return outputs
+
+  def visualize(self):
+    """Multi-channel visualization of densities as images.
+
+    Creates and returns an image summary visualizing the current probability
+    density estimates. The image contains one row for each channel. Within each
+    row, the pixel intensities are proportional to probability values, and each
+    row is centered on the median of the corresponding distribution.
+
+    Returns:
+      The created image summary.
+    """
+    with ops.name_scope(self._name_scope()):
+      image = self._pmf
+      image *= 255 / math_ops.reduce_max(image, axis=1, keepdims=True)
+      image = math_ops.cast(image + .5, dtypes.uint8)
+      image = image[None, :, :, None]
+    return summary.image("pmf", image, max_outputs=1)
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    return input_shape, input_shape
diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py b/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py
new file mode 100644
index 0000000000..798b0234eb
--- /dev/null
+++ b/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py
@@ -0,0 +1,315 @@
+# -*- coding: utf-8 -*-
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests of EntropyBottleneck class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.coder.python.layers import entropybottleneck
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+class EntropyBottleneckTest(test.TestCase):
+
+  def test_noise(self):
+    # Tests that the noise added is uniform noise between -0.5 and 0.5.
+    inputs = array_ops.placeholder(dtypes.float32, (None, 1))
+    layer = entropybottleneck.EntropyBottleneck()
+    noisy, _ = layer(inputs, training=True)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      values = np.linspace(-50, 50, 100)[:, None]
+      noisy, = sess.run([noisy], {inputs: values})
+      self.assertFalse(np.allclose(values, noisy, rtol=0, atol=.49))
+      self.assertAllClose(values, noisy, rtol=0, atol=.5)
+
+  def test_quantization(self):
+    # Tests that inputs are quantized to full integer values, even after
+    # quantiles have been updated.
+    inputs = array_ops.placeholder(dtypes.float32, (None, 1))
+    layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=False)
+    quantized, _ = layer(inputs, training=False)
+    opt = gradient_descent.GradientDescentOptimizer(learning_rate=1)
+    self.assertEqual(len(layer.losses), 1)
+    step = opt.minimize(layer.losses[0])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(step)
+      values = np.linspace(-50, 50, 100)[:, None]
+      quantized, = sess.run([quantized], {inputs: values})
+      self.assertAllClose(np.around(values), quantized, rtol=0, atol=1e-6)
+
+  def test_quantization_optimized_offset(self):
+    # Tests that inputs are not quantized to full integer values after quantiles
+    # have been updated. However, the difference between input and output should
+    # be between -0.5 and 0.5, and the offset must be consistent.
+    inputs = array_ops.placeholder(dtypes.float32, (None, 1))
+    layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=True)
+    quantized, _ = layer(inputs, training=False)
+    opt = gradient_descent.GradientDescentOptimizer(learning_rate=1)
+    self.assertEqual(len(layer.losses), 1)
+    step = opt.minimize(layer.losses[0])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(step)
+      values = np.linspace(-50, 50, 100)[:, None]
+      quantized, = sess.run([quantized], {inputs: values})
+      self.assertAllClose(values, quantized, rtol=0, atol=.5)
+      diff = np.ravel(np.around(values) - quantized) % 1
+      self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6)
+      self.assertNotEqual(diff[0], 0)
+
+  def test_codec(self):
+    # Tests that inputs are compressed and decompressed correctly, and quantized
+    # to full integer values, even after quantiles have been updated.
+    inputs = array_ops.placeholder(dtypes.float32, (1, None, 1))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_last", init_scale=60,
+        optimize_integer_offset=False)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    opt = gradient_descent.GradientDescentOptimizer(learning_rate=1)
+    self.assertEqual(len(layer.losses), 1)
+    step = opt.minimize(layer.losses[0])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(step)
+      self.assertEqual(len(layer.updates), 1)
+      sess.run(layer.updates[0])
+      values = np.linspace(-50, 50, 100)[None, :, None]
+      decoded, = sess.run([decoded], {inputs: values})
+      self.assertAllClose(np.around(values), decoded, rtol=0, atol=1e-6)
+
+  def test_codec_optimized_offset(self):
+    # Tests that inputs are compressed and decompressed correctly, and not
+    # quantized to full integer values after quantiles have been updated.
+    # However, the difference between input and output should be between -0.5
+    # and 0.5, and the offset must be consistent.
+    inputs = array_ops.placeholder(dtypes.float32, (1, None, 1))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_last", init_scale=60,
+        optimize_integer_offset=True)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    opt = gradient_descent.GradientDescentOptimizer(learning_rate=1)
+    self.assertEqual(len(layer.losses), 1)
+    step = opt.minimize(layer.losses[0])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(step)
+      self.assertEqual(len(layer.updates), 1)
+      sess.run(layer.updates[0])
+      values = np.linspace(-50, 50, 100)[None, :, None]
+      decoded, = sess.run([decoded], {inputs: values})
+      self.assertAllClose(values, decoded, rtol=0, atol=.5)
+      diff = np.ravel(np.around(values) - decoded) % 1
+      self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6)
+      self.assertNotEqual(diff[0], 0)
+
+  def test_codec_clipping(self):
+    # Tests that inputs are compressed and decompressed correctly, and clipped
+    # to the expected range.
+    inputs = array_ops.placeholder(dtypes.float32, (1, None, 1))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_last", init_scale=40)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      self.assertEqual(len(layer.updates), 1)
+      sess.run(layer.updates[0])
+      values = np.linspace(-50, 50, 100)[None, :, None]
+      decoded, = sess.run([decoded], {inputs: values})
+      expected = np.clip(np.around(values), -40, 40)
+      self.assertAllClose(expected, decoded, rtol=0, atol=1e-6)
+
+  def test_channels_last(self):
+    # Test the layer with more than one channel and multiple input dimensions,
+    # with the channels in the last dimension.
+    inputs = array_ops.placeholder(dtypes.float32, (None, None, None, 2))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_last", init_scale=50)
+    noisy, _ = layer(inputs, training=True)
+    quantized, _ = layer(inputs, training=False)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      self.assertEqual(len(layer.updates), 1)
+      sess.run(layer.updates[0])
+      values = 5 * np.random.normal(size=(7, 5, 3, 2))
+      noisy, quantized, decoded = sess.run(
+          [noisy, quantized, decoded], {inputs: values})
+      self.assertAllClose(values, noisy, rtol=0, atol=.5)
+      self.assertAllClose(values, quantized, rtol=0, atol=.5)
+      self.assertAllClose(values, decoded, rtol=0, atol=.5)
+
+  def test_channels_first(self):
+    # Test the layer with more than one channel and multiple input dimensions,
+    # with the channel dimension right after the batch dimension.
+    inputs = array_ops.placeholder(dtypes.float32, (None, 3, None, None))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_first", init_scale=50)
+    noisy, _ = layer(inputs, training=True)
+    quantized, _ = layer(inputs, training=False)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      self.assertEqual(len(layer.updates), 1)
+      sess.run(layer.updates[0])
+      values = 5 * np.random.normal(size=(2, 3, 5, 7))
+      noisy, quantized, decoded = sess.run(
+          [noisy, quantized, decoded], {inputs: values})
+      self.assertAllClose(values, noisy, rtol=0, atol=.5)
+      self.assertAllClose(values, quantized, rtol=0, atol=.5)
+      self.assertAllClose(values, decoded, rtol=0, atol=.5)
+
+  def test_compress(self):
+    # Test compression and decompression, and produce test data for
+    # `test_decompress`. If you set the constant at the end to `True`, this test
+    # will fail and the log will contain the new test data.
+    inputs = array_ops.placeholder(dtypes.float32, (2, 3, 10))
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_first", filters=(), init_scale=2)
+    bitstrings = layer.compress(inputs)
+    decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      self.assertEqual(1, len(layer.updates))  # Reports actual count on failure.
+      sess.run(layer.updates[0])
+      values = 5 * np.random.uniform(size=(2, 3, 10)) - 2.5
+      bitstrings, quantized_cdf, decoded = sess.run(
+          [bitstrings, layer._quantized_cdf, decoded], {inputs: values})
+      self.assertAllClose(values, decoded, rtol=0, atol=.5)
+      # Set this constant to `True` to log new test data for `test_decompress`.
+      if False:  # pylint:disable=using-constant-test
+        assert False, (bitstrings, quantized_cdf, decoded)
+
+  # Data generated by `test_compress`.
+  # pylint:disable=g-inconsistent-quotes,bad-whitespace
+  bitstrings = np.array([
+      b'\x1e\xbag}\xc2\xdaN\x8b\xbd.',
+      b'\x8dF\xf0%\x1cv\xccllW'
+  ], dtype=object)
+
+  quantized_cdf = np.array([
+      [    0, 15636, 22324, 30145, 38278, 65536],
+      [    0, 19482, 26927, 35052, 42904, 65535],
+      [    0, 21093, 28769, 36919, 44578, 65536]
+  ], dtype=np.int32)
+
+  expected = np.array([
+      [[-2.,  1.,  0., -2., -1., -2., -2., -2.,  2., -1.],
+       [ 1.,  2.,  1.,  0., -2., -2.,  1.,  2.,  0.,  1.],
+       [ 2.,  0., -2.,  2.,  0., -1., -2.,  0.,  2.,  0.]],
+      [[ 1.,  2.,  0., -1.,  1.,  2.,  1.,  1.,  2., -2.],
+       [ 2., -1., -1.,  0., -1.,  2.,  0.,  2., -2.,  2.],
+       [ 2., -2., -2., -1., -2.,  1., -2.,  0.,  0.,  0.]]
+  ], dtype=np.float32)
+  # pylint:enable=g-inconsistent-quotes,bad-whitespace
+
+  def test_decompress(self):
+    # Test that decompression of values compressed with a previous version
+    # works, i.e. that the file format doesn't change across revisions.
+    bitstrings = array_ops.placeholder(dtypes.string)
+    input_shape = array_ops.placeholder(dtypes.int32)
+    quantized_cdf = array_ops.placeholder(dtypes.int32)
+    layer = entropybottleneck.EntropyBottleneck(
+        data_format="channels_first", filters=(), dtype=dtypes.float32)
+    layer.build(self.expected.shape)
+    layer._quantized_cdf = quantized_cdf
+    decoded = layer.decompress(bitstrings, input_shape[1:])
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      decoded, = sess.run([decoded], {
+          bitstrings: self.bitstrings, input_shape: self.expected.shape,
+          quantized_cdf: self.quantized_cdf})
+      self.assertAllClose(self.expected, decoded, rtol=0, atol=1e-6)
+
+  def test_build_decompress(self):
+    # Test that layer can be built when `decompress` is the first call to it.
+    bitstrings = array_ops.placeholder(dtypes.string)
+    input_shape = array_ops.placeholder(dtypes.int32, shape=[3])
+    layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32)
+    layer.decompress(bitstrings, input_shape[1:], channels=5)
+    self.assertTrue(layer.built)
+
+  def test_pmf_normalization(self):
+    # Test that probability mass functions are normalized correctly.
+    layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32)
+    layer.build((None, 10))
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      pmf, = sess.run([layer._pmf])
+      self.assertAllClose(np.ones(10), np.sum(pmf, axis=-1), rtol=0, atol=1e-6)
+
+  def test_visualize(self):
+    # Test that the summary op can be constructed and evaluated in a session.
+    layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32)
+    layer.build((None, 10))
+    summary = layer.visualize()
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run([summary])
+
+  def test_normalization(self):
+    # Test that densities are normalized correctly.
+    inputs = array_ops.placeholder(dtypes.float32, (None, 1))
+    layer = entropybottleneck.EntropyBottleneck(filters=(2,))
+    _, likelihood = layer(inputs, training=True)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      x = np.repeat(np.arange(-200, 201), 1000)[:, None]
+      likelihood, = sess.run([likelihood], {inputs: x})
+      self.assertEqual(x.shape, likelihood.shape)
+      integral = np.sum(likelihood) * .001
+      self.assertAllClose(1, integral, rtol=0, atol=1e-4)
+
+  def test_entropy_estimates(self):
+    # Test that entropy estimates match actual range coding.
+    inputs = array_ops.placeholder(dtypes.float32, (1, None, 1))
+    layer = entropybottleneck.EntropyBottleneck(
+        filters=(2, 3), data_format="channels_last")
+    _, likelihood = layer(inputs, training=True)
+    diff_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2)
+    _, likelihood = layer(inputs, training=False)
+    disc_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2)
+    bitstrings = layer.compress(inputs)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      self.assertEqual(1, len(layer.updates))  # Reports actual count on failure.
+      sess.run(layer.updates[0])
+      diff_entropy, disc_entropy, bitstrings = sess.run(
+          [diff_entropy, disc_entropy, bitstrings],
+          {inputs: np.random.normal(size=(1, 10000, 1))})
+      codelength = 8 * sum(len(bitstring) for bitstring in bitstrings)
+      self.assertAllClose(diff_entropy, disc_entropy, rtol=5e-3, atol=0)
+      self.assertAllClose(disc_entropy, codelength, rtol=5e-3, atol=0)
+      self.assertGreater(codelength, disc_entropy)
+
+
+if __name__ == "__main__":
+  test.main()
-- 
GitLab


From 8e544335e15029ccccbe743ee0fefaa344b62e4e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 13:28:01 -0700
Subject: [PATCH 314/450] Remove unused function from FunctionDefLibrary.

PiperOrigin-RevId: 193974712
---
 .../grappler/optimizers/function_optimizer.cc | 126 +++++++++++++++---
 .../grappler/optimizers/function_optimizer.h  |   6 +-
 .../optimizers/function_optimizer_test.cc     |  32 ++---
 .../grappler/optimizers/meta_optimizer.cc     |   6 +-
 tensorflow/core/grappler/utils/functions.cc   |  12 +-
 tensorflow/core/grappler/utils/functions.h    |  40 ++++--
 .../core/grappler/utils/functions_test.cc     |   8 +-
 7 files changed, 163 insertions(+), 67 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index d008a9719f..47e7dc0a96 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -75,12 +76,10 @@ string UniqueSpecializedFunctionName(const FunctionDef& func,
 
 class FunctionOptimizerContext {
  public:
-  explicit FunctionOptimizerContext(const GrapplerItem& item,
-                                    RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level),
-        function_library_(FunctionLibraryDefinition(OpRegistry::Global(),
-                                                    item.graph.library())) {
-    InitializeInlinedFunctions(item);
+  explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level,
+                                    const GrapplerItem& item)
+      : function_library_(OpRegistry::Global(), item.graph.library()) {
+    InitializeInlinedFunctions(opt_level, item);
   }
 
   const FunctionLibraryDefinition& function_library() const {
@@ -101,8 +100,9 @@ class FunctionOptimizerContext {
   }
 
  private:
-  void InitializeInlinedFunctions(const GrapplerItem& item) {
-    bool aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
+  void InitializeInlinedFunctions(RewriterConfig::Toggle opt_level,
+                                  const GrapplerItem& item) {
+    bool aggressive = opt_level == RewriterConfig::AGGRESSIVE;
 
     for (const FunctionDef& func : item.graph.library().function()) {
       // Can't create IdentityN nodes with no input or output: skip these
@@ -120,7 +120,6 @@ class FunctionOptimizerContext {
     }
   }
 
-  RewriterConfig::Toggle opt_level_;
   FunctionLibraryDefinition function_library_;
   // Functions that can be inlined into optimized graph.
   std::unordered_map<string, const FunctionDef*> inlined_functions_;
@@ -128,9 +127,93 @@ class FunctionOptimizerContext {
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext);
 };
 
+// Return a trimmed FunctionDefLibrary that keeps only the functions (and their
+// gradient mappings) transitively reachable from the optimized graph.
+FunctionDefLibrary TrimFunctionLibrary(const FunctionLibraryDefinition& flib,
+                                       const GraphDef& optimized_graph) {
+  // Functions that are reachable from the optimized graph.
+  std::unordered_set<string> keep_funcs;
+
+  std::vector<const FunctionDef*> func_queue;
+  func_queue.reserve(flib.num_functions());
+
+  // Add registered and not already processed functions to the queue by name.
+  const auto add_to_func_queue = [&](const string& func_name) {
+    const FunctionDef* func = flib.Find(func_name);
+    if (func && keep_funcs.find(func_name) == keep_funcs.end()) {
+      func_queue.push_back(func);
+    }
+  };
+
+  // Find all the functions that are reachable from the given node.
+  const auto add_node_to_func_queue = [&](const NodeDef& node) {
+    // Node itself can be a call to the function.
+    add_to_func_queue(node.op());
+
+    // Or node can have an attribute referencing a function.
+    for (const auto& attr : node.attr()) {
+      const auto& attr_value = attr.second;
+
+      // 1. AttrValue.func
+      if (attr_value.has_func()) {
+        add_to_func_queue(attr_value.func().name());
+      }
+
+      // 2. AttrValue.ListValue.func
+      if (attr_value.has_list()) {
+        for (const auto& func : attr_value.list().func()) {
+          add_to_func_queue(func.name());
+        }
+      }
+    }
+  };
+
+  // Add all functions that are directly called from the optimized graph.
+  const auto& graph_nodes = optimized_graph.node();
+  std::for_each(graph_nodes.begin(), graph_nodes.end(), add_node_to_func_queue);
+
+  // Process all reachable functions; duplicate queue entries are skipped.
+  while (!func_queue.empty()) {
+    const FunctionDef* func = func_queue.back();
+    func_queue.pop_back();
+
+    const string& func_name = func->signature().name();
+    if (!keep_funcs.insert(func_name).second) continue;  // Already processed.
+
+    // Find all the functions called from the function body.
+    const auto& func_body = func->node_def();
+    std::for_each(func_body.begin(), func_body.end(), add_node_to_func_queue);
+
+    // Check if the function has a registered gradient.
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
+  }
+
+  FunctionDefLibrary lib;
+  for (const string& func_name : keep_funcs) {
+    const FunctionDef* func = CHECK_NOTNULL(flib.Find(func_name));
+    *lib.add_function() = *func;
+
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) {
+      GradientDef* gd = lib.add_gradient();
+      gd->set_function_name(func_name);
+      gd->set_gradient_func(grad_func_name);
+    }
+  }
+
+  VLOG(3) << "Trimmed function library: " << keep_funcs.size() << " functions ("
+          << static_cast<int>(keep_funcs.size() - flib.num_functions()) << ")";
+
+  return lib;
+}
+
 Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
                           FunctionOptimizerContext* ctx,
                           GraphDef* optimized_graph) {
+  VLOG(2) << "Specialize function instantiation: "
+          << SummarizeNodeDef(func_node);
+
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
@@ -141,20 +224,20 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   // TODO(ezhulenev): Push down const inputs and known input shapes.
-  FunctionDef specialized;
-  TF_RETURN_IF_ERROR(MakeSpecializedFunctionDef(item, flib, &specialized));
+  FunctionDef specialized_func;
+  TF_RETURN_IF_ERROR(MakeFunctionDef(item, flib, &specialized_func));
 
   // Find a name for specialized function.
   const string specialized_func_name =
       UniqueSpecializedFunctionName(func, func_node, flib);
 
-  specialized.mutable_signature()->set_name(specialized_func_name);
-  auto* specialized_attr = specialized.mutable_attr();
+  specialized_func.mutable_signature()->set_name(specialized_func_name);
+  auto* specialized_attr = specialized_func.mutable_attr();
   (*specialized_attr)[kGrapplerSpecializedFuncAttr].set_b(true);
 
   // Add specialized function to the library.
   TF_RETURN_IF_ERROR(
-      ctx->mutable_function_library().AddFunctionDef(specialized));
+      ctx->mutable_function_library().AddFunctionDef(specialized_func));
 
   // Add a function call node for the specialized function.
   NodeDef* specialized_func_node = optimized_graph->add_node();
@@ -226,6 +309,8 @@ Status HookInlinedFunctionOutputs(
 Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
                       const FunctionOptimizerContext& ctx,
                       GraphDef* optimized_graph) {
+  VLOG(2) << "Inline function instantiation: " << SummarizeNodeDef(func_node);
+
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
@@ -359,6 +444,8 @@ class SymbolicGradientEnv {
 
 Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
                               GraphDef* inlined_graph) {
+  VLOG(2) << "Inline symbolic gradient: " << SummarizeNodeDef(node);
+
   GraphDef graph_def;
 
   // Create a node to anchor the gradient inputs
@@ -454,13 +541,16 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
 
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
+  VLOG(1) << "Optimize Grappler item: id=" << item.id;
+
   // Nothing to do here.
   if (item.graph.library().function_size() == 0) {
+    VLOG(3) << "Skip Grappler item with empty function library";
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
-  FunctionOptimizerContext ctx(item, opt_level_);
+  FunctionOptimizerContext ctx(opt_level_, item);
   SymbolicGradientEnv env(item.graph.versions().producer(),
                           item.graph.library());
 
@@ -506,9 +596,11 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     *optimized_graph->add_node() = node;
   }
 
-  // TODO(bsteiner): trim the library to remove unused function definitions
   *optimized_graph->mutable_versions() = item.graph.versions();
-  *optimized_graph->mutable_library() = ctx.function_library().ToProto();
+  *optimized_graph->mutable_library() =
+      options_.enable_trim_function_library
+          ? TrimFunctionLibrary(ctx.function_library(), *optimized_graph)
+          : ctx.function_library().ToProto();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index c555fadf83..e307b4e533 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -26,8 +26,9 @@ namespace grappler {
 // operations to make the overall graph more efficient.
 class FunctionOptimizer : public GraphOptimizer {
  public:
-  FunctionOptimizer(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {}
-  ~FunctionOptimizer() override {}
+  explicit FunctionOptimizer(RewriterConfig::Toggle opt_level)
+      : opt_level_(opt_level) {}
+  ~FunctionOptimizer() override = default;
 
   string name() const override { return "function_optimizer"; };
 
@@ -44,6 +45,7 @@ class FunctionOptimizer : public GraphOptimizer {
     bool enable_function_inlining = true;
     bool enable_function_specialization = true;
     bool enable_symbolic_gradient_inlining = true;
+    bool enable_trim_function_library = true;
   };
 
   RewriterConfig::Toggle opt_level_;
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index fb006d4868..6147e8a27c 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -31,20 +31,8 @@ constexpr char kDevice[] = "/device:CPU:0";
 
 class FunctionOptimizerTest : public GrapplerTest {
  protected:
-  void DisableAll(FunctionOptimizer* optimizer) {
-    optimizer->options_.enable_function_inlining = false;
+  void DisableFunctionSpecialization(FunctionOptimizer* optimizer) {
     optimizer->options_.enable_function_specialization = false;
-    optimizer->options_.enable_symbolic_gradient_inlining = false;
-  }
-
-  void EnableOnlyFunctionInlining(FunctionOptimizer* optimizer) {
-    DisableAll(optimizer);
-    optimizer->options_.enable_function_inlining = true;
-  }
-
-  void EnableOnlyFunctionSpecialization(FunctionOptimizer* optimizer) {
-    DisableAll(optimizer);
-    optimizer->options_.enable_function_specialization = true;
   }
 };
 
@@ -352,7 +340,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithoutInput) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-  EnableOnlyFunctionInlining(&optimizer);
+  DisableFunctionSpecialization(&optimizer);  // do not specialize noinline func
 
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
@@ -626,14 +614,13 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-  EnableOnlyFunctionSpecialization(&optimizer);
 
-  // Mark XTimesTwo as noinline
+  // Mark XTimesTwo as noinline.
   FunctionDef x_times_two = test::function::XTimesTwo();
   (*x_times_two.mutable_attr())["_noinline"].set_b(true);
   std::vector<FunctionDef> function_library = {x_times_two};
 
-  // Build a graph to compute y = XTimesTwo(x)
+  // Build a graph to compute y = XTimesTwo(x).
   GrapplerItem item;
   item.graph = test::function::GDef(
       {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
@@ -644,12 +631,13 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  // Make sure that specialized function was added to the library
-  EXPECT_EQ(2, output.library().function_size());
+  // Make sure that specialized function was added to the library and original
+  // function was removed.
+  EXPECT_EQ(1, output.library().function_size());
   EXPECT_EQ("XTimesTwo_specialized_for_y",
-            output.library().function(1).signature().name());
+            output.library().function(0).signature().name());
 
-  // And 'y' node is calling specialized function
+  // And 'y' node is calling specialized function.
   int count = 0;
   for (const NodeDef& node : output.node()) {
     if (node.name() == "y" && count++) {
@@ -658,7 +646,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   }
   EXPECT_EQ(1, count);
 
-  // And that graph evaluation yields the same result
+  // And that graph evaluation yields the same result.
   Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 558b8a77e8..335fb403f1 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -219,11 +219,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   if (already_optimized) {
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
     ReassignColocation(optimized_graph);
-    // Make sure that the optimizers preserved the graph version and library.
-    DCHECK_GE(optimized_graph->library().function_size(),
-              item.graph.library().function_size());
-    DCHECK_GE(optimized_graph->library().gradient_size(),
-              item.graph.library().gradient_size());
+    // Make sure that the optimizers preserved the graph version.
     DCHECK_EQ(optimized_graph->versions().producer(),
               item.graph.versions().producer());
   }
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 638fe1999a..790809bc67 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -545,6 +545,12 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
   return Status::OK();
 }
 
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item) {
+  return MakeGrapplerFunctionItem(func, AttrValueMap(), flib, item);
+}
+
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity.
 Status RegisterGrapplerFunctionConnectivity(
@@ -560,9 +566,9 @@ Status RegisterGrapplerFunctionConnectivity(
   return Status::OK();
 }
 
-Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
-                                  const FunctionLibraryDefinition& flib,
-                                  FunctionDef* func) {
+Status MakeFunctionDef(const GrapplerFunctionItem& item,
+                       const FunctionLibraryDefinition& flib,
+                       FunctionDef* func) {
   func->mutable_signature()->set_name(item.id);
   func->mutable_signature()->set_is_stateful(item.is_stateful());
 
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index ab369bcad7..5e8b6c6960 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -38,7 +38,8 @@ using AttrValueMap = std::unordered_map<string, AttrValue>;
 // function body in place of function inputs and a resolved input data type.
 struct InputArgExpansion {
   // TODO(ezhulenev): Add support for functions with tensor sequence inputs of
-  // different data types
+  // different data types.
+  // TODO(ezhulenev): Support type parametrized inputs?
   string input_name;                 // name of the function input argument
   DataType data_type;                // input data type
   bool is_ref;                       // if true, inputs are required to be refs
@@ -53,7 +54,8 @@ struct InputArgExpansion {
 // tensors of a function body nodes and a resolved output data type
 struct OutputArgExpansion {
   // TODO(ezhulenev): Add support for functions with tensor sequence outputs of
-  // different data types
+  // different data types.
+  // TODO(ezhulenev): Support type parametrized outputs?
   string output_name;                  // name of the function output argument
   DataType data_type;                  // output data type
   bool is_ref;                         // if true, outputs are refs
@@ -186,13 +188,6 @@ bool HasParametrizedBody(const FunctionDef& func);
 // Check if function has parametrized type or body.
 bool IsParametrized(const FunctionDef& func);
 
-// Make a GrapplerFunctionItem from the function definition and attributes.
-// Return error if the given function def cannot be converted.
-Status MakeGrapplerFunctionItem(
-    const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
-    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
-
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity.  Use function library definition to
 // lookup function body nodes output names and ranges.
@@ -200,11 +195,28 @@ Status RegisterGrapplerFunctionConnectivity(
     const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
     GrapplerFunctionConnectivity* connectivity);
 
-// Make a specialized FunctionDef from the GrapplerFunctionItem. Use function
-// library definition to lookup function body nodes output names and ranges.
-Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
-                                  const FunctionLibraryDefinition& flib,
-                                  FunctionDef* func);
+// Make a GrapplerFunctionItem from the function definition and function
+// instantiation attributes (caller node attributes). Returns error if the given
+// function def cannot be converted (e.g. not all attributes are defined).
+Status MakeGrapplerFunctionItem(
+    const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
+    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
+
+// Make a GrapplerFunctionItem from the function definition. Function must be
+// fully defined (no type or body parametrization).
+// TODO(ezhulenev): Support parametrized functions without fully defined
+// instantiation attributes? Do we ever want to optimize a parametrized function
+// without specializing it to its instantiation attributes (at least types)?
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item);
+
+// Make a FunctionDef from the GrapplerFunctionItem. Use function library
+// definition to lookup function body nodes output names and ranges.
+Status MakeFunctionDef(const GrapplerFunctionItem& item,
+                       const FunctionLibraryDefinition& flib,
+                       FunctionDef* func);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 54d235a8a4..6dfd49b943 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -524,7 +524,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
   EXPECT_EQ("two", cast.input(0));
 }
 
-TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
+TEST_F(FunctionsTest, MakeFunctionDef) {
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
       // Name
@@ -550,7 +550,7 @@ TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
   TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   FunctionDef specialized;
-  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
+  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
 
   // Input and output types are resolved based on instantiation attributes.
   EXPECT_EQ("x", specialized.signature().input_arg(0).name());
@@ -573,7 +573,7 @@ TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
   EXPECT_EQ(2, count);
 }
 
-TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) {
+TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
   using test::function::NDef;
 
   FunctionDef mul_func = FunctionDefHelper::Create(
@@ -606,7 +606,7 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) {
   // Replace function body with identity function
   item.SwapFunctionBody(std::move(id_func_body));
   FunctionDef specialized;
-  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
+  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
 
   // Check that graph body was updated.
   int count = 0;
-- 
GitLab


From 19ee0605b6eadb516703c37b7ba38e7122a6c51f Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Mon, 23 Apr 2018 13:43:13 -0700
Subject: [PATCH 315/450] Updating freeze_graph dependencies.

PiperOrigin-RevId: 193977096
---
 tensorflow/python/BUILD       | 1 +
 tensorflow/python/tools/BUILD | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 698e2a28bf..9dc03d7cdb 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -70,6 +70,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = [
         "//tensorflow:__pkg__",
+        "//tensorflow/python/tools:__pkg__",
     ],
     deps = [
         ":array_ops",
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 84d20f8e36..6c34b6aaf3 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -38,9 +38,9 @@ py_library(
     deps = [
         ":saved_model_utils",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:client",
         "//tensorflow/python:framework",
+        "//tensorflow/python:no_contrib",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:training",
-- 
GitLab


From 105c7df01b12b77bc17909cfb4a0d0c0aff87571 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 13:44:57 -0700
Subject: [PATCH 316/450] More relaxed size checking for TransposeConv, and
 miscellaneous bug fixes.

PiperOrigin-RevId: 193977375
---
 .../internal/optimized/optimized_ops.h        |  3 +
 .../internal/reference/reference_ops.h        |  3 +
 .../propagate_fixed_sizes.cc                  | 56 +++++++------------
 .../resolve_constant_binary.cc                |  7 ++-
 .../resolve_multiply_by_zero.cc               |  5 ++
 5 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 49ce1133d3..d585bcca0e 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -5774,6 +5774,9 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& right_paddings, T* output_data,
                 const Dims<4>& output_dims, const int32_t pad_value) {
   gemmlowp::ScopedProfilingLabel label("Pad");
+  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
+  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index d1d4f54f86..ae295cc8b5 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3065,6 +3065,9 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
                 const Dims<4>& output_dims, const int32_t pad_value) {
+  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
+  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index ba244cf5ef..7946492633 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -168,7 +168,9 @@ void ProcessConvOperator(Model* model, ConvOperator* op) {
     return;
   }
   const auto& input_shape = input_array.shape();
-  CHECK_EQ(input_shape.dimensions_count(), 4);
+  CHECK(input_shape.dimensions_count() == 4)
+      << "Conv ops require 4D inputs. Input array \"" << op->inputs[0]
+      << "\" is " << input_shape.dimensions_count() << "D.";
 
   const auto& weights_array = model->GetArray(op->inputs[1]);
   // Yield until weights dims have been resolved.
@@ -249,12 +251,6 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) {
       << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape "
       << toco::ShapeToString(weights_shape) << ".";
 
-  CHECK(weights_shape.dims(0) == 1 && weights_shape.dims(3) == 1)
-      << "TransposeConv weights dimensions must begin and end with 1. Input "
-         "weights \""
-      << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape "
-      << toco::ShapeToString(weights_shape) << ".";
-
   // Compute padding
   const int kheight = weights_shape.dims(1);
   const int kwidth = weights_shape.dims(2);
@@ -269,9 +265,7 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) {
     LOG(FATAL) << "TransposeConv only supports SAME or VALID padding";
   }
 
-  // VALIDATE OUTPUT SHAPE
-  // Compute the output shape from the input and weights shapes to verify it
-  // agrees with the specified output shape.
+  // VALIDATE some dimensions and set the output shape.
   const auto& input_array =
       model->GetArray(op->inputs[TransposeConvOperator::DATA_INPUT]);
   if (!input_array.has_shape()) {
@@ -283,31 +277,13 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) {
       << "TransposeConv input shape must have 4 dimensions. Input \""
       << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape "
       << toco::ShapeToString(weights_shape) << ".";
+  CHECK_EQ(input_shape.dims(3), weights_shape.dims(0))
+      << "Input shape depth and weight depth do not agree";
 
-  // Compute output shape
-  const int input_width = input_shape.dims(2);
-  const int input_height = input_shape.dims(1);
-  int output_height = op->stride_height * (input_height - 1);
-  int output_width = op->stride_width * (input_width - 1);
-  if (op->padding.type == PaddingType::kValid) {
-    output_height += kheight;
-    output_width += kwidth;
-  } else if (op->padding.type == PaddingType::kSame) {
-    output_height += 1;
-    output_width += 1;
-  }
-
-  CHECK(specified_output_shape_array.GetBuffer<ArrayDataType::kInt32>().data ==
-        std::vector<int32>({input_shape.dims(0), output_height, output_width,
-                            weights_shape.dims(3)}))
-      << "Specified output shape: " << ShapeToString(output_array.shape())
-      << ", does not agree with shape computed from input data and weights: ["
-      << input_shape.dims(0) << ", " << output_height << ", " << output_width
-      << ", " << weights_shape.dims(3) << "].";
-
-  // SUCCESS: Set the op's output shape according to the specified output shape.
-  *(output_array.mutable_shape()->mutable_dims()) =
+  // Set the output shape according to the specified output shape.
+  std::vector<int32> const& specified_output_shape =
       specified_output_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  *(output_array.mutable_shape()->mutable_dims()) = specified_output_shape;
 }
 
 void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) {
@@ -1179,6 +1155,11 @@ void ProcessRankOperator(Model* model, RankOperator* op) {
     return;
   }
 
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
   const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
@@ -1200,6 +1181,11 @@ void ProcessShapeOperator(Model* model, TensorFlowShapeOperator* op) {
     return;
   }
 
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
   const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
@@ -1230,10 +1216,6 @@ void ProcessStackOperator(Model* model, StackOperator* op) {
     }
 
     Shape shape = input_array.shape();
-    if (shape.dimensions_count() == 0) {
-      // Convert 0D scalars to 1D scalars of shape {1}.
-      shape.mutable_dims()->push_back(1);
-    }
     if (!stacked_shape) {
       stacked_shape.reset(new Shape(shape));
     } else {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
index 5e779f6765..6e78653fad 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
@@ -233,7 +233,12 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
   }
 
   // Check that input data types agree.
-  CHECK(input0_array.data_type == input1_array.data_type);
+  CHECK(input0_array.data_type == input1_array.data_type)
+      << "Dissimilar data types given to op outputting \""
+      << binary_op->outputs[0] << "\". 0:\"" << binary_op->inputs[0] << "\"("
+      << static_cast<int>(input0_array.data_type) << ")   1:\""
+      << binary_op->inputs[1] << "\"("
+      << static_cast<int>(input1_array.data_type) << ").";
 
   // Do the actual constants propagation
   EvaluateBinaryOperatorOnConstantInputs(model, binary_op);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
index 37beb41dfc..4bb1217828 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
@@ -60,6 +60,11 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
   const auto& output_array_name = mul_op->outputs[0];
   auto& output_array = model->GetArray(output_array_name);
 
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return false;
+  }
+
   // Yield if the output shape is not known yet.
   if (!output_array.has_shape()) {
     return false;
-- 
GitLab


From 5db49b64f244b89870aff89a13309796ae060620 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 14:05:40 -0700
Subject: [PATCH 317/450] [XLA] Add xla_builder and xla_computation to every
 test target that will be migrated.

PiperOrigin-RevId: 193981015
---
 tensorflow/compiler/xla/tests/BUILD | 89 +++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 1f90a44d8b..25bbde1677 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -153,6 +153,8 @@ tf_cc_binary(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
@@ -191,6 +193,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:interpreter_plugin",  # reference backend
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -288,6 +291,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -311,6 +316,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -330,6 +337,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -371,6 +380,8 @@ xla_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -390,6 +401,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -442,6 +454,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -461,6 +475,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -478,6 +494,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -514,6 +532,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -535,6 +555,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -554,6 +576,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -578,6 +602,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -604,6 +630,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -670,6 +697,8 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -715,6 +744,8 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -738,6 +769,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -760,6 +793,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -813,6 +848,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -836,6 +873,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -898,6 +937,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -923,6 +964,8 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -963,6 +1006,8 @@ xla_test(
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1038,6 +1083,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1196,6 +1243,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1235,6 +1284,8 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1256,6 +1307,8 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1294,6 +1347,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1310,6 +1365,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1335,6 +1392,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1355,6 +1414,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -1428,6 +1489,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1472,6 +1535,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1514,6 +1579,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1532,6 +1599,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1595,6 +1664,8 @@ xla_test(
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -1608,6 +1679,8 @@ xla_test(
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -1629,6 +1702,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1713,6 +1788,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -1740,6 +1817,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -1777,6 +1856,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1802,6 +1883,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -1860,6 +1943,8 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1886,6 +1971,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1982,6 +2069,8 @@ xla_test(
         ":test_utils",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
-- 
GitLab


From 01bc05347f430039c8efec10131b795178c9e302 Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Mon, 23 Apr 2018 14:20:49 -0700
Subject: [PATCH 318/450] Run the canned estimator test on 2 GPUs as well.

PiperOrigin-RevId: 193983700
---
 .../contrib/distribute/python/estimator_integration_test.py    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index c5a520ab5a..34410a6470 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -61,7 +61,8 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
           mode=['graph'],
           distribution=[
               combinations.one_device_strategy,
-              combinations.mirrored_strategy_with_gpu_and_cpu
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.mirrored_strategy_with_two_gpus
           ]))
   def test_complete_flow_with_mode(self, distribution):
     label_dimension = 2
-- 
GitLab


From d3b60b2210521a71961f675cb69bbe148b21b8da Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Mon, 23 Apr 2018 14:24:11 -0700
Subject: [PATCH 319/450] Reapply #18446.

---
 tensorflow/python/framework/test_util.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index f954b9d6c7..5a8bc43727 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1014,6 +1014,8 @@ class TensorFlowTestCase(googletest.TestCase):
       config.graph_options.optimizer_options.opt_level = -1
       config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
+      config.graph_options.rewrite_options.arithmetic_optimization = (
+          rewriter_config_pb2.RewriterConfig.OFF)
       return config
 
     if graph is None:
-- 
GitLab


From 1d54aeb8e1f89ac0d13eacca1eac863476f4ee0a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Mon, 23 Apr 2018 14:23:11 -0700
Subject: [PATCH 320/450] Simplified shape inference for queues

PiperOrigin-RevId: 193984176
---
 .../core/grappler/costs/graph_properties.cc      | 16 ++++------------
 .../core/grappler/costs/graph_properties.h       |  2 +-
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index a0125ce342..ca30ad83a0 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -1080,7 +1080,7 @@ Status GraphProperties::PropagateShapes(
       // fanout of the queues, we need to manually propagate the shapes from
       // enqueue node to the corresponding queue.
       TF_RETURN_IF_ERROR(UpdateResource(resource.first, resource.second,
-                                        shape_refiner, relax, new_shapes));
+                                        shape_refiner, new_shapes));
     }
   } while (!new_shapes->empty() &&
            num_resource_iterations++ < max_resource_iterations);
@@ -1094,7 +1094,7 @@ Status GraphProperties::PropagateShapes(
 
 Status GraphProperties::UpdateResource(
     const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
-    SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes) {
+    SymbolicShapeRefiner* shape_refiner, TopoQueue* new_shapes) {
   // Proceed only if qnode is a queue or an Enter with queue input.
   if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode)) {
     return Status::OK();
@@ -1108,9 +1108,6 @@ Status GraphProperties::UpdateResource(
   // Merge all inputs into the enqueue node, regardless of which phase we
   // are in.
   std::vector<ShapeAndType> queue_shapes_and_types;
-  if (queue_handle_data) {
-    queue_shapes_and_types = *queue_handle_data;
-  }
   for (const auto& node : queue_inputs) {
     auto ctx = shape_refiner->GetContext(node);
     if (!ctx) {
@@ -1126,13 +1123,8 @@ Status GraphProperties::UpdateResource(
       if (queue_shapes_and_types.empty()) {
         queue_shapes_and_types = shapes_and_types;
       } else {
-        if (relax) {
-          TF_RETURN_IF_ERROR(RelaxEnqueueShapesAndMergeTypes(
-              shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
-        } else {
-          TF_RETURN_IF_ERROR(MergeEnqueueShapesAndTypes(
-              shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
-        }
+        TF_RETURN_IF_ERROR(RelaxEnqueueShapesAndMergeTypes(
+            shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
       }
     }
   }
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 4c3f3f5f53..a4e3031db1 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -93,7 +93,7 @@ class GraphProperties {
   // enqueue its fanout in 'new_shapes'.
   static Status UpdateResource(
       const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
-      SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes);
+      SymbolicShapeRefiner* shape_refiner, TopoQueue* new_shapes);
 
   // Update the output shapes of a Merge node, and enqueue its fanout in
   // new_shapes if needed.
-- 
GitLab


From d12244894aa0cdd068b46ebed407ced1915272b2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 14:39:53 -0700
Subject: [PATCH 321/450] Use %zu instead of %lu since size_t is not an
 unsigned long on 32-bit.

PiperOrigin-RevId: 193987261
---
 tensorflow/contrib/lite/optional_debug_tools.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/optional_debug_tools.cc b/tensorflow/contrib/lite/optional_debug_tools.cc
index e0a0910117..dfdd80ea8a 100644
--- a/tensorflow/contrib/lite/optional_debug_tools.cc
+++ b/tensorflow/contrib/lite/optional_debug_tools.cc
@@ -72,7 +72,7 @@ const char* AllocTypeName(TfLiteAllocationType type) {
 
 // Prints a dump of what tensors and what nodes are in the interpreter.
 void PrintInterpreterState(Interpreter* interpreter) {
-  printf("Interpreter has %lu tensors and %lu nodes\n",
+  printf("Interpreter has %zu tensors and %zu nodes\n",
          interpreter->tensors_size(), interpreter->nodes_size());
   printf("Inputs:");
   PrintIntVector(interpreter->inputs());
-- 
GitLab


From f97fec3cf5d361103d21989b78a74dd1820620d8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 14:58:58 -0700
Subject: [PATCH 322/450] Refactoring triangular_solve.cc to use the new common
 utility functions.

PiperOrigin-RevId: 193990473
---
 .../compiler/tf2xla/lib/triangular_solve.cc   | 82 ++++++-------------
 1 file changed, 25 insertions(+), 57 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
index 7f72a6073d..9bf5821b54 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -83,15 +83,6 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
         block_size);
   }
 
-  // Returns [b1, b2, ... , bn, indices[0], indices[1]].
-  auto prepend_batch_dims = [&](std::array<int64, 2> indices) {
-    std::vector<int64> output(ndims);
-    std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin());
-    std::copy(indices.begin(), indices.end(),
-              output.begin() + batch_dimensions.size());
-    return output;
-  };
-
   // Applies a complex conjugation operation if `a` is complex and `conjugate_a`
   // is true, otherwise returns its argument.
   auto maybe_conj = [&](xla::ComputationBuilder* builder,
@@ -108,11 +99,12 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
       std::unique_ptr<xla::ComputationBuilder> sub = builder->CreateSubBuilder(
           tensorflow::strings::StrCat("trsm_base_", k));
 
-      auto a_param =
-          sub->Parameter(0,
-                         xla::ShapeUtil::MakeShape(b_shape->element_type(),
-                                                   prepend_batch_dims({k, k})),
-                         "a");
+      auto a_param = sub->Parameter(
+          0,
+          xla::ShapeUtil::MakeShape(
+              b_shape->element_type(),
+              PrependMajorDims(sub.get(), batch_dimensions, {k, k})),
+          "a");
 
       std::array<int64, 2> b_lastd;
       if (left_side) {
@@ -120,11 +112,12 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
       } else {
         b_lastd = {m, k};
       }
-      auto b_param =
-          sub->Parameter(1,
-                         xla::ShapeUtil::MakeShape(b_shape->element_type(),
-                                                   prepend_batch_dims(b_lastd)),
-                         "b");
+      auto b_param = sub->Parameter(
+          1,
+          xla::ShapeUtil::MakeShape(
+              b_shape->element_type(),
+              PrependMajorDims(sub.get(), batch_dimensions, b_lastd)),
+          "b");
 
       // We use a left-looking subroutine on the block diagonal in some common
       // cases, while falling back to a recursive call in unsupported cases. The
@@ -380,14 +373,6 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
     batch_dimensions.push_back(a_size);
   }
 
-  auto prepend_batch_dims = [&](std::array<int64, 2> indices) {
-    std::vector<int64> output(ndims);
-    std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin());
-    std::copy(indices.begin(), indices.end(),
-              output.begin() + batch_dimensions.size());
-    return output;
-  };
-
   auto maybe_conj = [&](xla::ComputationBuilder* builder,
                         xla::ComputationDataHandle x) {
     auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a;
@@ -479,30 +464,6 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
     auto body_b = bodyb->GetTupleElement(input_tuple, 3);
     auto zero = bodyb->ConstantR0<int32>(0);
 
-    // Set up some helper functions.
-    auto prepend_zeros = [&](std::array<xla::ComputationDataHandle, 2> starts) {
-      auto zero = bodyb->Reshape(bodyb->ConstantR0<int32>(0), {1});
-      std::vector<xla::ComputationDataHandle> padded_starts(ndims, zero);
-      padded_starts[ndims - 2] = bodyb->Reshape(starts[0], {1});
-      padded_starts[ndims - 1] = bodyb->Reshape(starts[1], {1});
-      return bodyb->ConcatInDim(padded_starts, 0);
-    };
-
-    auto dynamic_slice = [&](xla::ComputationDataHandle x,
-                             std::array<xla::ComputationDataHandle, 2> starts,
-                             std::array<int64, 2> sizes) {
-      auto padded_starts = prepend_zeros(starts);
-      auto padded_sizes = prepend_batch_dims(sizes);
-      return bodyb->DynamicSlice(x, padded_starts, padded_sizes);
-    };
-
-    auto update = [&](xla::ComputationDataHandle x,
-                      xla::ComputationDataHandle update,
-                      std::array<xla::ComputationDataHandle, 2> starts) {
-      auto padded_starts = prepend_zeros(starts);
-      return bodyb->DynamicUpdateSlice(x, update, padded_starts);
-    };
-
     // We'd like to implement this:
     //   if transpose_a:
     //     a_row = T(a[..., i+1:, i:i+1])
@@ -516,22 +477,29 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
     // all zeros and use that as zero-padding (doing unnecessary FLOPs).
     xla::ComputationDataHandle a_row;
     if (transpose_a) {
-      a_row = dynamic_slice(body_a, {zero, i}, {m, 1});
+      TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a,
+                                                         {zero, i}, {m, 1}));
     } else {
-      a_row = dynamic_slice(body_a, {i, zero}, {1, m});
+      TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a,
+                                                         {i, zero}, {1, m}));
     }
     TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), a_row, body_out,
                                                 /*transpose_x=*/transpose_a,
                                                 /*transpose_y=*/false,
                                                 /*conjugate_x=*/conjugate_a,
                                                 /*conjugate_y=*/false));
-    auto result_row =
-        bodyb->Sub(dynamic_slice(body_b, {i, zero}, {1, n}), b_update);
+    TF_ASSIGN_OR_RETURN(
+        auto result_row_slice,
+        DynamicSliceInMinorDims(bodyb.get(), body_b, {i, zero}, {1, n}));
+    auto result_row = bodyb->Sub(result_row_slice, b_update);
 
     // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1]
-    auto a_elt = dynamic_slice(body_a, {i, i}, {1, 1});
+    TF_ASSIGN_OR_RETURN(auto a_elt, DynamicSliceInMinorDims(bodyb.get(), body_a,
+                                                            {i, i}, {1, 1}));
     auto div_result = bodyb->Div(result_row, maybe_conj(bodyb.get(), a_elt));
-    body_out = update(body_out, div_result, {i, zero});
+    TF_ASSIGN_OR_RETURN(body_out,
+                        DynamicUpdateSliceInMinorDims(bodyb.get(), body_out,
+                                                      div_result, {i, zero}));
 
     // if transpose_a:
     //   return (i - 1, body_out, a, b)
-- 
GitLab


From 6f6c75a7673cd73dfbaaba3f259ce9ab5c8086a1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 15:00:43 -0700
Subject: [PATCH 323/450] [XLA] Redesign: migrate xla/tests/a*, xla/tests/b*.

PiperOrigin-RevId: 193990756
---
 .../xla/tests/array_elementwise_ops_test.cc   | 27 +++---
 .../compiler/xla/tests/axpy_simple_test.cc    |  5 +-
 .../tests/bad_rng_shape_validation_test.cc    | 12 +--
 .../compiler/xla/tests/bfloat16_test.cc       | 13 ++-
 .../compiler/xla/tests/binop_scaling_test.cc  | 14 ++--
 .../xla/tests/broadcast_simple_test.cc        | 82 +++++++++----------
 .../xla/tests/client_library_test_base.cc     |  8 ++
 .../xla/tests/client_library_test_base.h      |  3 +
 8 files changed, 84 insertions(+), 80 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 4b4dc6dd9d..e8a5efe796 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
@@ -214,7 +213,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementC64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<uint64> lhs{0xFFFFFFFF,
                           static_cast<uint64>(-1),
@@ -255,7 +254,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS64s) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<int64> lhs{static_cast<int64>(0x8000000000000000LL),
                          static_cast<int64>(0x8000000000000000LL),
@@ -1332,7 +1331,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) {
 
 // Some Pow cases that can be implemented more efficiently.
 XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values = {1.0f, 2.0f, 3.2f, -4.0f};
   std::vector<float> exponents = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1360,7 +1359,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1385,7 +1384,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, 4.0f, 0.5f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1410,7 +1409,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1435,7 +1434,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1460,7 +1459,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1492,7 +1491,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1525,7 +1524,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, 1.0f, 0.5f};
@@ -1558,7 +1557,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Div4F32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -2357,7 +2356,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Eq) {
   // Test broadcasting in Eq comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({42, 73});
   auto m = builder.ConstantR2<int32>({{42, 73}, {42, 52}});
 
@@ -2783,7 +2782,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, NonIdentityBroadcastOfSameRankIsDisallowed) {
 // Regression test for b/31927799. "slice - y" is fused and requires implicit
 // broadcast.
 XLA_TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x_literal = Literal::CreateR1<float>({1, 2, 3});
   auto y_literal = Literal::CreateR1<float>({4, 5});
   auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
index ec3b46acfe..fcd9ff55e3 100644
--- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -42,7 +41,7 @@ TEST_F(AxpySimpleTest, AxTenValues) {
 }
 
 XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) {
-  ComputationBuilder builder(client_, "axpy_10");
+  XlaBuilder builder("axpy_10");
   auto alpha = builder.ConstantR0<float>(3.1415926535);
   auto x = builder.ConstantR1<float>({});
   auto y = builder.ConstantR1<float>({});
@@ -54,7 +53,7 @@ XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) {
 }
 
 TEST_F(AxpySimpleTest, AxpyTenValues) {
-  ComputationBuilder builder(client_, "axpy_10");
+  XlaBuilder builder("axpy_10");
   auto alpha = builder.ConstantR0<float>(3.1415926535);
   auto x = builder.ConstantR1<float>(
       {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0});
diff --git a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
index e4bf1827ac..22c3394e6f 100644
--- a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
+++ b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -34,13 +34,13 @@ namespace {
 class BadRngShapeValidationTest : public ClientLibraryTestBase {};
 
 TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR0<float>(0.0);
   auto one = builder.ConstantR0<float>(1.0);
   Shape default_constructed;
   builder.RngUniform(zero, one, default_constructed);
 
-  StatusOr<Computation> computation = builder.Build();
+  StatusOr<XlaComputation> computation = builder.Build();
   EXPECT_FALSE(computation.ok());
   LOG(INFO) << "status received: " << computation.status();
   EXPECT_THAT(computation.status().error_message(),
@@ -48,7 +48,7 @@ TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) {
 }
 
 TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR0<float>(0.0);
   auto one = builder.ConstantR0<float>(1.0);
   Shape sans_layout;
@@ -57,7 +57,7 @@ TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) {
 
   builder.RngUniform(zero, one, sans_layout);
 
-  StatusOr<Computation> computation = builder.Build();
+  StatusOr<XlaComputation> computation = builder.Build();
   ASSERT_TRUE(computation.ok());
   LOG(INFO) << computation.status();
 }
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
index b853dfaa15..4e65cf11f3 100644
--- a/tensorflow/compiler/xla/tests/bfloat16_test.cc
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -19,10 +19,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -52,7 +51,7 @@ class Bfloat16Test : public ClientLibraryTestBase {
 };
 
 XLA_TEST_F(Bfloat16Test, ScalarOperation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(2.0f));
   auto y = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(1.0f));
   builder.Add(x, y);
@@ -62,7 +61,7 @@ XLA_TEST_F(Bfloat16Test, ScalarOperation) {
 }
 
 XLA_TEST_F(Bfloat16Test, LogOperation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(4.0f));
   builder.Log(x);
 
@@ -71,7 +70,7 @@ XLA_TEST_F(Bfloat16Test, LogOperation) {
 }
 
 XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Neg(builder.ConstantR0<bfloat16>(static_cast<bfloat16>(2.1f)));
 
   ComputeAndCompareR0<bfloat16>(&builder, static_cast<bfloat16>(-2.1f), {},
@@ -80,7 +79,7 @@ XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
 
 XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<bfloat16>(
       {{{{static_cast<bfloat16>(1.f)}, {static_cast<bfloat16>(2.f)}},
@@ -117,7 +116,7 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
 
 XLA_TEST_F(Bfloat16Test, BatchNormGrad) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<bfloat16>(
       Array4D<bfloat16>(2, 2, 2, 1, static_cast<bfloat16>(0.0f)));
diff --git a/tensorflow/compiler/xla/tests/binop_scaling_test.cc b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
index 97fec89b63..48203b1d40 100644
--- a/tensorflow/compiler/xla/tests/binop_scaling_test.cc
+++ b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -32,7 +32,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_32x4) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 32, 4);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 4);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -48,7 +48,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_129x129) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 129, 129);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 129);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -64,7 +64,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_9x5) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 9, 5);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 9, 1);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -80,7 +80,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 129, 257);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 129, 1);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -93,7 +93,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) {
 }
 
 TEST_F(BinopScalingTest, R0PlusR2F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR0<float>(42.0);
   auto rhs = builder.ConstantR2<float>({
       {1.0, 2.0}, {3.0, 4.0},
@@ -109,7 +109,7 @@ TEST_F(BinopScalingTest, R0PlusR2F32) {
 }
 
 TEST_F(BinopScalingTest, R4PlusR0S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   Array4D<int> lhs_array({
     {{{1, 2},
diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
index 97095f1cc4..34c86e007b 100644
--- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -33,10 +33,8 @@ namespace {
 
 class BroadcastSimpleTest : public ClientLibraryTestBase {
  public:
-  ComputationDataHandle BuildBinOp(HloOpcode op,
-                                   const ComputationDataHandle& lhs,
-                                   const ComputationDataHandle& rhs,
-                                   ComputationBuilder* builder) {
+  XlaOp BuildBinOp(HloOpcode op, const XlaOp& lhs, const XlaOp& rhs,
+                   XlaBuilder* builder) {
     switch (op) {
       case HloOpcode::kMinimum: {
         return builder->Min(lhs, rhs);
@@ -105,21 +103,21 @@ class BroadcastSimpleTest : public ClientLibraryTestBase {
 using ::testing::HasSubstr;
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarNoOpBroadcast) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(1.5), {});
   ComputeAndCompareR0<float>(&b, 1.5, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x3) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(2.25), {2, 3});
   Array2D<float> expected(2, 3, 2.25);
   ComputeAndCompareR2<float>(&b, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) {
-  ComputationBuilder b(client_, TestName());
-  ComputationDataHandle src;
+  XlaBuilder b(TestName());
+  XlaOp src;
   std::unique_ptr<GlobalData> param_data =
       CreateR0Parameter<float>(2.25f, /*parameter_number=*/0, /*name=*/"src",
                                /*builder=*/&b, /*data_handle=*/&src);
@@ -131,21 +129,21 @@ XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(2.25), {2, 0});
   Array2D<float> expected(2, 0);
   ComputeAndCompareR2<float>(&b, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_0x2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(2.25), {0, 2});
   Array2D<float> expected(0, 2);
   ComputeAndCompareR2<float>(&b, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR1<float>({1, 2, 3}), {2});
 
   Array2D<float> expected(2, 3);
@@ -160,7 +158,7 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) {
 
 // Tests implicit broadcasting of PREDs.
 XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   Array2D<bool> x_vals(2, 1);
   x_vals(0, 0) = true;
@@ -171,7 +169,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) {
   y_vals(1, 0, 0) = true;
   y_vals(1, 1, 0) = true;
 
-  ComputationDataHandle x, y;
+  XlaOp x, y;
   auto x_data = CreateR2Parameter<bool>(x_vals, 0, "x", &b, &x);
   auto y_data = CreateR3Parameter<bool>(y_vals, 1, "y", &b, &y);
   b.And(x, y, /*broadcast_dimensions=*/{1, 2});
@@ -186,7 +184,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR1<float>({}), {2});
 
   Array2D<float> expected(2, 0);
@@ -194,7 +192,7 @@ XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, 1DToZeroElement2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR1<float>({1, 2, 3}), {0});
 
   Array2D<float> expected(0, 3);
@@ -209,7 +207,7 @@ XLA_TEST_F(BroadcastSimpleTest, InDimensionAndDegenerateBroadcasting) {
   // broadcasting (broadcast_dimensions {1, 2}), then is added to the rhs shape
   // [2, 3, 1]. Degenerate dimension broadcasting then broadcasts the size one
   // dimensions.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 5.0}}),
         b.ConstantLiteral(*Literal::CreateR3<float>(
@@ -247,7 +245,7 @@ class BroadcastR3ImplicitTest
 
 XLA_TEST_P(BroadcastR3ImplicitTest, Doit) {
   const R3ImplicitBroadcastSpec& spec = GetParam();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Shape r3_shape, r3_implicit_shape;
   Array3D<float> r3_array(spec.output_bounds[0], spec.output_bounds[1],
@@ -264,8 +262,7 @@ XLA_TEST_P(BroadcastR3ImplicitTest, Doit) {
 
   auto r3_implicit_parameter = builder.Parameter(0, r3_implicit_shape, "input");
   auto r3_parameter = builder.Parameter(1, r3_shape, "input");
-  ComputationDataHandle op =
-      BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder);
+  XlaOp op = BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder);
 
   Array3D<float> expected_array(spec.output_bounds[0], spec.output_bounds[1],
                                 spec.output_bounds[2]);
@@ -300,9 +297,9 @@ INSTANTIATE_TEST_CASE_P(BroadcastR3ImplicitTestInstances,
 
 // r1 and r3's dim0 matches, and r1's dim1 and dim2 have size 1:
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) {
-  ComputationBuilder b(client_, TestName());
-  ComputationDataHandle r1h;
-  ComputationDataHandle r3h;
+  XlaBuilder b(TestName());
+  XlaOp r1h;
+  XlaOp r3h;
 
   Array3D<float> r1d = {{{1}}, {{2}}};
   Array3D<float> r3d = {{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}};
@@ -319,7 +316,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -332,7 +329,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1}, {2}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -345,7 +342,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}, {3, 4}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -358,7 +355,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}}, {{3, 4}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -371,7 +368,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 =
       b.ConstantLiteral(*Literal::CreateR3<float>({{{1}, {2}}, {{3}, {4}}}));
   auto r3 = b.ConstantLiteral(
@@ -385,7 +382,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1_2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -491,7 +488,7 @@ class BroadcastR2ImplicitTest
 XLA_TEST_P(BroadcastR2ImplicitTest, Doit) {
   const R2ImplicitBroadcastSpec& spec = GetParam();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // Operands with degenerate dimensions require implicit broadcasting:
   Shape r2_shape, r2_implicit_shape1, r2_implicit_shape2;
@@ -517,10 +514,9 @@ XLA_TEST_P(BroadcastR2ImplicitTest, Doit) {
   auto r2_implicit_parameter2 =
       builder.Parameter(2, r2_implicit_shape2, "input2");
 
-  ComputationDataHandle op1 =
+  XlaOp op1 =
       BuildBinOp(spec.op1, r2_implicit_parameter1, r2_parameter, &builder);
-  ComputationDataHandle op2 =
-      BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder);
+  XlaOp op2 = BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder);
 
   Array2D<float> expected_array(spec.output_bounds[0], spec.output_bounds[1]);
 
@@ -547,7 +543,7 @@ INSTANTIATE_TEST_CASE_P(BroadcastR2ImplicitTestInstances,
                         ::testing::ValuesIn(kR2ImplicitBroadcastTestCases));
 
 XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}}));
   auto r2 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}, {3, 4}}));
   b.Add(r2, r1);
@@ -558,7 +554,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR2<float>({{1}, {2}}));
   auto r2 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}, {3, 4}}));
   b.Add(r2, r1);
@@ -569,7 +565,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -582,7 +578,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -595,7 +591,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -608,7 +604,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1_0 = b.ConstantR1<float>({1000, 2000});
   auto r1_1 = b.ConstantR1<float>({100, 200});
   auto r1_2 = b.ConstantR1<float>({10, 20});
@@ -629,7 +625,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1_0 = b.ConstantR1<float>({1000, 2000});
   auto r1_1 = b.ConstantR1<float>({100, 200});
   auto r1_2 = b.ConstantR1<float>({10, 20});
@@ -652,7 +648,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) {
 XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) {
   // Binary dimension broadcasting of the smaller lhs ([2, 2] up to [2, 2, 2])
   // results in a shape incompatible with the lhs [2, 3, 1].
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 5.0}, {1.0, 5.0}}),
         b.ConstantLiteral(*Literal::CreateR3<float>(
@@ -667,7 +663,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) {
 
 XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) {
   // Test invalid broadcasting with [1, 2] and [2, 3] inputs.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 2.0}}),
         b.ConstantR2<float>({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}));
@@ -680,7 +676,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) {
 
 XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) {
   // Test invalid broadcasting with [1, 2] and [2, 3] inputs.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 2.0}}),
         b.ConstantR2<float>({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}));
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 69389dae3f..31c9e21644 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -211,6 +211,14 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
                                                   arguments);
 }
 
+void ClientLibraryTestBase::ComputeAndCompareR1(
+    XlaBuilder* builder, const tensorflow::core::Bitmap& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+  std::unique_ptr<Literal> expected_literal = Literal::CreateR1(expected);
+  ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
+                                                  arguments);
+}
+
 template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareLiteral(
     BuilderT* builder, const Literal& expected,
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 481d7c5c25..85ebe29ae9 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -165,6 +165,9 @@ class ClientLibraryTestBase : public ::testing::Test {
   void ComputeAndCompareR1(ComputationBuilder* builder,
                            const tensorflow::core::Bitmap& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+  void ComputeAndCompareR1(XlaBuilder* builder,
+                           const tensorflow::core::Bitmap& expected,
+                           tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
   template <typename NativeT, typename BuilderT>
   void ComputeAndCompareR2(BuilderT* builder, const Array2D<NativeT>& expected,
-- 
GitLab


From 9e1d93d28fe30171de3f6838028eeadb44b0d6fd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 15:15:25 -0700
Subject: [PATCH 324/450] Changing tf.foldl and tf.foldr to accept
 multiple/nested tensors as element/initializer.

PiperOrigin-RevId: 193993295
---
 .../kernel_tests/functional_ops_test.py       |  40 +++++++
 tensorflow/python/ops/functional_ops.py       | 100 ++++++++++++------
 2 files changed, 110 insertions(+), 30 deletions(-)

diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 34fb655035..5f48be94da 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -70,6 +70,26 @@ class FunctionalOpsTest(test.TestCase):
           initializer=10)
       self.assertAllEqual(880, self.evaluate(r))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldl_SingleInputMultiOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array([1, -1.0])
+      r = functional_ops.foldl(lambda a, x: a + x, elems, initializer)
+      r_value = self.evaluate(r)
+
+      self.assertAllEqual(22, r_value[0])
+      self.assertAllEqual(20, r_value[1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldl_MultiInputSingleOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array(1.0)
+      r = functional_ops.foldl(lambda a, x: a + x[0] + x[1], (elems, -elems),
+                               initializer)
+      self.assertAllEqual(1, self.evaluate(r))
+
   def testFoldl_Scoped(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
@@ -105,6 +125,26 @@ class FunctionalOpsTest(test.TestCase):
           initializer=10)
       self.assertAllEqual(1282, self.evaluate(r))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldr_SingleInputMultiOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array([1, -1.0])
+      r = functional_ops.foldr(lambda a, x: a + x, elems, initializer)
+      r_value = self.evaluate(r)
+
+      self.assertAllEqual(22, r_value[0])
+      self.assertAllEqual(20, r_value[1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldr_MultiInputSingleOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array(1.0)
+      r = functional_ops.foldr(lambda a, x: a + x[0] + x[1], (elems, -elems),
+                               initializer)
+      self.assertAllEqual(1, self.evaluate(r))
+
   def testFoldr_Scoped(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 161f6f3659..1b3a1e5cbc 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -65,10 +65,20 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
   of the result tensor is fn(initializer, values[0]).shape`.
 
+  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
+  is a (possibly nested) list or tuple of tensors, then each of these tensors
+  must have a matching first (unpack) dimension.  The second argument of `fn`
+  matches the structure of `elems`.  That is, if `elems` is
+  `(t1, [t2, t3, [t4, t5]])`, then `fn` is called as
+  `fn(a, (t1, [t2, t3, [t4, t5]]))`, where `a` is the accumulator.
+
   Args:
     fn: The callable to be performed.
-    elems: A tensor to be unpacked on dimension 0.
-    initializer: (optional) The initial value for the accumulator.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which
+      will be unpacked along their first dimension.  The nested sequence
+      of the resulting slices will be the second argument to `fn`.
+    initializer: (optional) A tensor or (possibly nested) sequence of tensors,
+      as the initial value for the accumulator.
     parallel_iterations: (optional) The number of iterations allowed to run
       in parallel.
     back_prop: (optional) True enables support for back propagation.
@@ -76,8 +86,9 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     name: (optional) Name prefix for the returned tensors.
 
   Returns:
-    A tensor resulting from applying `fn` consecutively to the list of tensors
-    unpacked from `elems`, from first to last.
+    A tensor or (possibly nested) sequence of tensors, resulting from applying
+    `fn` consecutively to the list of tensors unpacked from `elems`, from first
+    to last.
 
   Raises:
     TypeError: if `fn` is not callable.
@@ -92,6 +103,11 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   if not callable(fn):
     raise TypeError("fn must be callable.")
 
+  def create_ta(elem):
+    return tensor_array_ops.TensorArray(
+        dtype=elem.dtype, size=n, dynamic_size=False,
+        infer_shape=True).unstack(elem)
+
   in_graph_mode = not context.executing_eagerly()
   with ops.name_scope(name, "foldl", [elems]):
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
@@ -107,24 +123,26 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         varscope.set_caching_device(lambda op: op.device)
         varscope_caching_device_was_none = True
 
-    # Convert elems to tensor array.
-    elems = ops.convert_to_tensor(elems, name="elems")
-    n = array_ops.shape(elems)[0]
-    elems_ta = tensor_array_ops.TensorArray(dtype=elems.dtype, size=n,
-                                            dynamic_size=False,
-                                            infer_shape=True)
-    elems_ta = elems_ta.unstack(elems)
+    # Convert elems to tensor array. n may be known statically.
+    elems_flat = [
+        ops.convert_to_tensor(elem, name="elem") for elem in nest.flatten(elems)
+    ]
+    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+
+    elems_ta = nest.map_structure(create_ta, elems)
 
     if initializer is None:
-      a = elems_ta.read(0)
+      a = nest.map_structure(lambda elem: elem.read(0), elems_ta)
       i = constant_op.constant(1)
     else:
-      a = ops.convert_to_tensor(initializer)
+      a = initializer
       i = constant_op.constant(0)
 
     def compute(i, a):
-      a = fn(a, elems_ta.read(i))
+      elem_i = nest.map_structure(lambda elem: elem.read(i), elems_ta)
+      a = fn(a, elem_i)
       return [i + 1, a]
+
     _, r_a = control_flow_ops.while_loop(
         lambda i, a: i < n, compute, [i, a],
         parallel_iterations=parallel_iterations,
@@ -135,6 +153,7 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     # supported in Eager
     if in_graph_mode and varscope_caching_device_was_none:
       varscope.set_caching_device(None)
+
     return r_a
 
 
@@ -153,10 +172,20 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
   of the result tensor is `fn(initializer, values[0]).shape`.
 
+  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
+  is a (possibly nested) list or tuple of tensors, then each of these tensors
+  must have a matching first (unpack) dimension.  The second argument of `fn`
+  matches the structure of `elems`.  That is, if `elems` is
+  `(t1, [t2, t3, [t4, t5]])`, then `fn` is called as
+  `fn(a, (t1, [t2, t3, [t4, t5]]))`, where `a` is the accumulator.
+
   Args:
     fn: The callable to be performed.
-    elems: A tensor that is unpacked into a sequence of tensors to apply `fn`.
-    initializer: (optional) The initial value for the accumulator.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which
+      will be unpacked along their first dimension.  The nested sequence
+      of the resulting slices will be the second argument to `fn`.
+    initializer: (optional) A tensor or (possibly nested) sequence of tensors,
+      as the initial value for the accumulator.
     parallel_iterations: (optional) The number of iterations allowed to run
       in parallel.
     back_prop: (optional) True enables support for back propagation.
@@ -164,8 +193,9 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     name: (optional) Name prefix for the returned tensors.
 
   Returns:
-    A tensor resulting from applying `fn` consecutively to the list of tensors
-    unpacked from `elems`, from last to first.
+    A tensor or (possibly nested) sequence of tensors, resulting from applying
+    `fn` consecutively to the list of tensors unpacked from `elems`, from last
+    to first.
 
   Raises:
     TypeError: if `fn` is not callable.
@@ -180,6 +210,11 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   if not callable(fn):
     raise TypeError("fn must be callable.")
 
+  def create_ta(elem):
+    return tensor_array_ops.TensorArray(
+        dtype=elem.dtype, size=n, dynamic_size=False,
+        infer_shape=True).unstack(elem)
+
   in_graph_mode = not context.executing_eagerly()
   with ops.name_scope(name, "foldr", [elems]):
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
@@ -195,26 +230,30 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         varscope.set_caching_device(lambda op: op.device)
         varscope_caching_device_was_none = True
 
-    # Convert elems to tensor array.
-    elems = ops.convert_to_tensor(elems, name="elems")
-    n = array_ops.shape(elems)[0]
-    elems_ta = tensor_array_ops.TensorArray(dtype=elems.dtype, size=n,
-                                            dynamic_size=False,
-                                            infer_shape=True)
-    elems_ta = elems_ta.unstack(elems)
+    # Convert elems to tensor array. n may be known statically.
+    elems_flat = [
+        ops.convert_to_tensor(elem, name="elem") for elem in nest.flatten(elems)
+    ]
+    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+
+    elems_ta = nest.map_structure(create_ta, elems)
 
     if initializer is None:
       i = n - 1
-      a = elems_ta.read(i)
+      a = nest.map_structure(lambda elem: elem.read(i), elems_ta)
     else:
       i = n
-      a = ops.convert_to_tensor(initializer)
+      a = initializer
+
     def compute(i, a):
       i -= 1
-      a = fn(a, elems_ta.read(i))
-      return [i, a]
+      elem = nest.map_structure(lambda elem: elem.read(i), elems_ta)
+      a_out = fn(a, elem)
+      return [i, a_out]
+
     _, r_a = control_flow_ops.while_loop(
-        lambda i, a: i > 0, compute, [i, a],
+        lambda i, a: i > 0,
+        compute, [i, a],
         parallel_iterations=parallel_iterations,
         back_prop=back_prop,
         swap_memory=swap_memory)
@@ -223,6 +262,7 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     # supported in Eager
     if in_graph_mode and varscope_caching_device_was_none:
       varscope.set_caching_device(None)
+
     return r_a
 
 
-- 
GitLab


From 01141932a9cdcd871310db141a66a47410c48ac0 Mon Sep 17 00:00:00 2001
From: Igor Ganichev <iga@google.com>
Date: Mon, 23 Apr 2018 15:30:12 -0700
Subject: [PATCH 325/450] Support executing ops eagerly through XLA

The only real change is to add GpuDeviceInfo to XlaDevice.
It is used by eager runtime to retrieve default device context.

PiperOrigin-RevId: 193995586
---
 tensorflow/compiler/jit/BUILD             |   1 +
 tensorflow/compiler/jit/xla_device.cc     |  40 +++++--
 tensorflow/compiler/jit/xla_device.h      |   8 ++
 tensorflow/compiler/jit/xla_gpu_device.cc |   9 ++
 tensorflow/compiler/tests/BUILD           |  19 +++
 tensorflow/compiler/tests/eager_test.py   | 137 ++++++++++++++++++++++
 6 files changed, 206 insertions(+), 8 deletions(-)
 create mode 100644 tensorflow/compiler/tests/eager_test.py

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 50fa95c4f3..53b124cf89 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -180,6 +180,7 @@ cc_library(
         "//tensorflow/core/kernels:no_op",
         "//tensorflow/core/kernels:sendrecv_ops",
         "//tensorflow/core/kernels:variable_ops",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 12f471735f..2c2ac839b3 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdlib.h>
 #include <unordered_set>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device_context.h"
@@ -181,9 +182,15 @@ XlaDevice::XlaDevice(const SessionOptions& options,
       jit_device_name_(jit_device_name),
       xla_allocator_(nullptr),
       platform_(platform),
-      transfer_as_literal_(transfer_as_literal) {}
+      transfer_as_literal_(transfer_as_literal) {
+  VLOG(1) << "Created XLA device " << jit_device_name;
+}
 
-XlaDevice::~XlaDevice() {}
+XlaDevice::~XlaDevice() {
+  if (gpu_device_info_ != nullptr) {
+    gpu_device_info_->default_context->Unref();
+  }
+}
 
 xla::LocalClient* XlaDevice::client() const {
   // We lazily create the client because the platform commits to the
@@ -191,9 +198,8 @@ xla::LocalClient* XlaDevice::client() const {
   // don't want to do it until we get a chance to hook the platform up
   // to a simulator.
 
-  // For now GetOrCreateLocalClient always returns success when passed
-  // a non-null platform. If that changes we may have to plumb in some
-  // way to pass Status back.
+  // TODO(b/78468222): This can fail, at least when the backend is GPU and
+  // there is no GPU on the host.
   return xla::ClientLibrary::GetOrCreateLocalClient(platform_).ValueOrDie();
 }
 
@@ -218,14 +224,32 @@ xla::StatusOr<se::Stream*> XlaDevice::GetStream() {
   return stream_.get();
 }
 
+Status XlaDevice::CreateAndSetGpuDeviceInfo() {
+  if (gpu_device_info_ == nullptr) {
+    TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
+    // Call GetAllocator for the side-effect of ensuring the allocator
+    // is created.
+    GetAllocator({});
+    // XlaDevice owns both gpu_device_info_ and
+    // gpu_device_info_->default_context.
+    gpu_device_info_ = absl::make_unique<GpuDeviceInfo>();
+    gpu_device_info_->stream = stream;
+    gpu_device_info_->default_context =
+        new XlaDeviceContext(stream, client(), transfer_as_literal_);
+    gpu_device_info_->default_context->Ref();
+    set_tensorflow_gpu_device_info(gpu_device_info_.get());
+  }
+
+  return Status::OK();
+}
+
 Status XlaDevice::FillContextMap(const Graph* graph,
                                  DeviceContextMap* device_context_map) {
   VLOG(1) << "XlaDevice::FillContextMap";
   device_context_map->resize(graph->num_node_ids());
   TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
-  // Call GetAllocator for the side-effect of ensuring the allocator and
-  // XlaTensorInfoManager is created.
-  (void)GetAllocator({});
+  // Call GetAllocator for the side-effect of ensuring the allocator is created.
+  GetAllocator({});
   auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_);
   for (Node* n : graph->nodes()) {
     VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name();
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 4fe7dd8c9f..2f5c53aea8 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -105,6 +105,10 @@ class XlaDevice : public LocalDevice {
   xla::LocalClient* client() const;
   xla::StatusOr<::perftools::gputools::Stream*> GetStream();
 
+  // If not already set, create and set GpuDeviceInfo.
+  // Not thread-safe
+  Status CreateAndSetGpuDeviceInfo();
+
  private:
   // The metadata of this XlaDevice.
   const Metadata xla_metadata_;
@@ -123,6 +127,10 @@ class XlaDevice : public LocalDevice {
   // Must we use XLA's transfer manager for correct host<->device transfers? if
   // false, we can use ThenMemcpy() instead.
   bool transfer_as_literal_;
+
+  // If set, holds default device context (that we must Unref)
+  // and its stream.
+  std::unique_ptr<GpuDeviceInfo> gpu_device_info_;
 };
 
 // Builds OpKernel registrations on 'device' for the JIT operators
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index ac60423d95..a8afbf9dcd 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -54,6 +54,15 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options,
     VLOG(1) << "Failed to create XLA_GPU device: " << status;
     return Status::OK();
   }
+
+  // TODO(b/78468222): Uncomment after fixing this bug
+  // status = device->CreateAndSetGpuDeviceInfo();
+  // if (!status.ok()) {
+  //  errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT,
+  //                          " device");
+  //  return status;
+  // }
+
   devices->push_back(device.release());
   return Status::OK();
 }
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 46b86c53aa..ac2441cea0 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -308,6 +308,25 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "eager_test",
+    size = "small",
+    srcs = ["eager_test.py"],
+    disabled_backends = [
+        # TODO(b/78199195): Support XLA CPU devices in the eager runtime.
+        "cpu",
+        "cpu_ondemand",
+        # TODO(b/78468222): Enable the GPU backend.
+        "gpu",
+    ],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "fft_test",
     size = "medium",
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
new file mode 100644
index 0000000000..bdd0185dfe
--- /dev/null
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -0,0 +1,150 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test cases for eager execution using XLA."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import googletest
+
+
+class EagerTest(XLATestCase):
+  """Exercises eager ops, variables and gradients in the XLA test scope."""
+
+  def testBasic(self):
+    """Multiplies two scalar constants eagerly and checks the product."""
+    with self.test_scope():
+      three = constant_op.constant(3)
+      five = constant_op.constant(5)
+      product = three * five
+      self.assertAllEqual(15, product)
+
+  def testExecuteListOutputLen0(self):
+    """Unstacking an empty tensor returns an empty Python list."""
+    with self.test_scope():
+      empty = constant_op.constant([], dtype=dtypes.int32)
+      result = array_ops.unstack(empty, 0)
+      self.assertIsInstance(result, list)
+      self.assertEqual(0, len(result))
+
+  def testExecuteListOutputLen1(self):
+    """Splitting into one piece returns a single-element list."""
+    with self.test_scope():
+      split_dim = constant_op.constant(1)
+      value = constant_op.constant([[0, 1, 2], [3, 4, 5]])
+      result = array_ops.split(value, 1, axis=split_dim)
+      self.assertIsInstance(result, list)
+      self.assertEqual(1, len(result))
+      self.assertAllEqual([[0, 1, 2], [3, 4, 5]], result[0])
+
+  def testExecuteListOutputLen3(self):
+    """Splitting into three pieces returns the three columns as a list."""
+    with self.test_scope():
+      split_dim = constant_op.constant(1)
+      value = constant_op.constant([[0, 1, 2], [3, 4, 5]])
+      result = array_ops.split(value, 3, axis=split_dim)
+      self.assertIsInstance(result, list)
+      self.assertEqual(3, len(result))
+      self.assertAllEqual([[0], [3]], result[0])
+      self.assertAllEqual([[1], [4]], result[1])
+      self.assertAllEqual([[2], [5]], result[2])
+
+  def testBasicGraph(self):
+    """Runs the same multiply eagerly, then in graph mode via a session."""
+    # Run some ops eagerly
+    with self.test_scope():
+      three = constant_op.constant(3)
+      five = constant_op.constant(5)
+      product = three * five
+      self.assertAllEqual(15, product)
+
+    # Run the same ops in graph mode
+    with context.graph_mode(), self.test_session() as sess:
+      with self.test_scope():
+        three = constant_op.constant(3)
+        five = constant_op.constant(5)
+        product = three * five
+        self.assertAllEqual(15, sess.run(product))
+
+  def testDegenerateSlices(self):
+    """Degenerate (empty) strided slices match the NumPy results."""
+    with self.test_scope():
+      npt = np.arange(1, 19, dtype=np.float32).reshape(3, 2, 3)
+      t = constant_op.constant(npt)
+      # degenerate by offering a forward interval with a negative stride
+      self.assertAllEqual(npt[0:-1:-1, :, :], t[0:-1:-1, :, :])
+      # degenerate with a reverse interval with a positive stride
+      self.assertAllEqual(npt[-1:0, :, :], t[-1:0, :, :])
+      # empty interval in every dimension
+      self.assertAllEqual(npt[-1:0, 2:2, 2:3:-1], t[-1:0, 2:2, 2:3:-1])
+
+  def testIdentity(self):
+    """array_ops.identity of a Python scalar returns the same value."""
+    with self.test_scope():
+      self.assertAllEqual(2, array_ops.identity(2))
+
+  def testIdentityOnVariable(self):
+    """array_ops.identity of a ResourceVariable yields the variable's value."""
+    with self.test_scope():
+      v = resource_variable_ops.ResourceVariable(True)
+      i = array_ops.identity(v)
+    self.assertAllEqual(True, i.numpy())
+
+  def testAssignAddVariable(self):
+    """assign_add on a ResourceVariable is reflected in its value."""
+    with self.test_scope():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      v.assign_add(2.0)
+    self.assertEqual(3.0, v.numpy())
+
+  def testGradient(self):
+    """gradients_function of the identity passes dy through unchanged."""
+    def f(x):
+      return x
+
+    with self.test_scope():
+      grad_fn = backprop.gradients_function(f)
+      self.assertAllEqual(2., grad_fn(1., dy=2.)[0])
+
+  def testVariableGradient(self):
+    """implicit_grad of v0 * v0 at v0 = 1.0 yields a gradient of 2.0."""
+    with self.test_scope():
+      v0 = resource_variable_ops.ResourceVariable(1.0)
+
+      def f():
+        x = v0 * v0
+        return x
+
+      grads = backprop.implicit_grad(f)()
+    self.assertEqual(2., grads[0][0].numpy())
+
+
+if __name__ == "__main__":
+  # Enable eager execution (logging device placement) before running tests.
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(log_device_placement=True))
+  googletest.main()
-- 
GitLab


From 2f2d4745836fdcf4bf365644017a900d98bd6206 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 23 Apr 2018 15:43:20 -0700
Subject: [PATCH 326/450] Not using a control flow context when building eager
 functions.

PiperOrigin-RevId: 193997756
---
 tensorflow/python/eager/function.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 0f1170bb42..b924448abe 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -405,7 +405,15 @@ class GraphModeFunction(object):
       c_known_ops = set()
       c_captured_tensors = set()
 
-      def add_op_internal(op):
+      existing_op_len = len(self._graph.get_operations())
+      filtered_outputs = [x for x in self._returns if x is not None]
+      self._out_grad_placeholders = [
+          graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
+      in_gradients = gradients_impl.gradients(
+          filtered_outputs,
+          self._input_placeholders,
+          grad_ys=self._out_grad_placeholders)
+      for op in self._graph.get_operations()[existing_op_len:]:
         if op.type in ["Variable", "VariableV2", "VarHandleOp"]:
           raise ValueError("tfe.defun cannot capture variables created without "
                            "using tf.get_variable. Op: %s" % op)
@@ -414,17 +422,6 @@ class GraphModeFunction(object):
           if i.op not in c_known_ops:
             c_captured_tensors.add(i)
 
-      c = HelperContext(add_op_internal)
-
-      with c:
-        filtered_outputs = [x for x in self._returns if x is not None]
-        self._out_grad_placeholders = [
-            graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
-        in_gradients = gradients_impl.gradients(
-            filtered_outputs,
-            self._input_placeholders,
-            grad_ys=self._out_grad_placeholders)
-
     backward_outputs = tuple(
         grad for grad in _flatten(in_gradients) if grad is not None)
     output_shapes = tuple(grad.shape for grad in backward_outputs)
-- 
GitLab


From c8a1eeb98ca394d0330bead37b446bce998bb3d5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 15:50:56 -0700
Subject: [PATCH 327/450] [XLA] Redesign: migrate convolution tests.

PiperOrigin-RevId: 193998684
---
 tensorflow/compiler/xla/BUILD                 |   2 +-
 tensorflow/compiler/xla/reference_util.cc     |   6 +-
 .../convolution_dimension_numbers_test.cc     |  38 +++-
 .../xla/tests/convolution_variants_test.cc    | 167 +++++++++---------
 4 files changed, 116 insertions(+), 97 deletions(-)

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 88f37433a5..1af9cb6d2a 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -605,8 +605,8 @@ cc_library(
         ":util",
         ":window_util",
         ":xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_evaluator",
         "//tensorflow/compiler/xla/service:shape_inference",
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index ad3a28e119..df9dbc5830 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <array>
 #include <utility>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -90,7 +90,7 @@ std::unique_ptr<Array2D<T>> MatmulArray2DImpl(
     Padding padding) {
   return ConvArray3DGeneralDimensionsDilated(
       lhs, rhs, kernel_stride, padding, 1, 1,
-      ComputationBuilder::CreateDefaultConvDimensionNumbers(1));
+      XlaBuilder::CreateDefaultConvDimensionNumbers(1));
 }
 
 /*static*/ std::unique_ptr<Array3D<float>>
@@ -140,7 +140,7 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated(
     std::pair<int64, int64> kernel_stride, Padding padding) {
   return ConvArray4DGeneralDimensions(
       lhs, rhs, kernel_stride, padding,
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 }
 
 /* static */ std::unique_ptr<Array4D<float>>
diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
index 896b34fb6e..b5a42e3059 100644
--- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -34,13 +34,35 @@ limitations under the License.
 namespace xla {
 namespace {
 
+StatusOr<ConvolutionDimensionNumbers> CreateConvDimensionNumbers(
+    int64 input_batch, int64 input_feature, int64 input_first_spatial,
+    int64 input_second_spatial, int64 output_batch, int64 output_feature,
+    int64 output_first_spatial, int64 output_second_spatial,
+    int64 kernel_output_feature, int64 kernel_input_feature,
+    int64 kernel_first_spatial, int64 kernel_second_spatial) {
+  ConvolutionDimensionNumbers dnums;  // Filled in parameter order below.
+  dnums.set_input_batch_dimension(input_batch);
+  dnums.set_input_feature_dimension(input_feature);
+  dnums.add_input_spatial_dimensions(input_first_spatial);
+  dnums.add_input_spatial_dimensions(input_second_spatial);
+  dnums.set_output_batch_dimension(output_batch);
+  dnums.set_output_feature_dimension(output_feature);
+  dnums.add_output_spatial_dimensions(output_first_spatial);
+  dnums.add_output_spatial_dimensions(output_second_spatial);
+  dnums.set_kernel_output_feature_dimension(kernel_output_feature);
+  dnums.set_kernel_input_feature_dimension(kernel_input_feature);
+  dnums.add_kernel_spatial_dimensions(kernel_first_spatial);
+  dnums.add_kernel_spatial_dimensions(kernel_second_spatial);
+  TF_RETURN_IF_ERROR(XlaBuilder::Validate(dnums));  // Propagates any error.
+  return dnums;
+}
+
 class ConvolutionDimensionNumbersTest : public ClientLibraryTestBase {};
 
 // Tests the convolution operation with invalid input dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0,
-                                                     1, 2, 3);
+      CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("input are not unique"));
@@ -49,8 +71,7 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) {
 // Tests the convolution operation with invalid weight dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0,
-                                                     2, 2, 3);
+      CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("weight are not unique"));
@@ -59,8 +80,7 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) {
 // Tests the convolution operation with invalid output dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidOutputDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0,
-                                                     1, 2, 3);
+      CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0, 1, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("output are not unique"));
@@ -76,14 +96,14 @@ XLA_TEST_F(ConvolutionDimensionNumbersTest,
       client_->TransferToServer(*Literal::CreateR4FromArray4D(*weight_array))
           .ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(*input_array);
   auto weight =
       builder.Parameter(0, ShapeUtil::MakeShape(F32, {4, 3, 1, 1}), "weight");
   auto conv1 = builder.Conv(input, weight, {1, 1}, Padding::kValid);
 
   ConvolutionDimensionNumbers dim_nums =
-      ComputationBuilder::CreateDefaultConvDimensionNumbers();
+      XlaBuilder::CreateDefaultConvDimensionNumbers();
   // Swap batch_dimension and feature_dimension.
   int64 old_input_batch_dim = dim_nums.input_batch_dimension();
   int64 old_output_batch_dim = dim_nums.output_batch_dimension();
diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
index 9c1145def8..50d6e25d86 100644
--- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -52,7 +53,7 @@ class ConvolutionVariantsTest : public ClientLibraryTestBase {
 };
 
 XLA_TEST_F(ConvolutionVariantsTest, Minimal) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const Array4D<float> input_array(1, 1, 1, 1, {2});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -67,7 +68,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Minimal) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, MinimalWithBatch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const Array4D<float> input_array(5, 1, 1, 1, {1, 2, 3, 4, 5});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -82,7 +83,7 @@ XLA_TEST_F(ConvolutionVariantsTest, MinimalWithBatch) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Flat1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(2, 1, 3, 4);
   input_array.FillWithMultiples(1);
@@ -99,7 +100,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Flat1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Deep1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 2, 1, 1, {10, 1});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -114,7 +115,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Deep1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 2, {1, 2});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -129,7 +130,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 3, {1, 2, 3});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -144,7 +145,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -159,7 +160,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x1in2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -174,7 +175,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1in2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2in2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -189,7 +190,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2in2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x3WithDepthAndBatch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(
       2, 2, 2, 3, {0, 1, 2, 3, 4, 5,  6,  7,  8,  9,  0, 0,    // plane 0
@@ -210,7 +211,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x3WithDepthAndBatch) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x4) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 4, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -225,7 +226,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x4) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 5, {1, 2, 3, 4, 5});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -240,7 +241,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x5) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x4) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 4, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -255,7 +256,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x4) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 5, {1, 2, 3, 4, 5});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -270,7 +271,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x5) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride2x2in3x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -285,7 +286,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride2x2in3x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter3x1in1x1Padded) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 1, {1});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -300,7 +301,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x1in1x1Padded) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter5x1in3x1Padded) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 3, {1, 2, 3});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -315,7 +316,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter5x1in3x1Padded) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -333,7 +334,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1in2x1WithPaddingAndDepth) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 2, 1, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -348,7 +349,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1in2x1WithPaddingAndDepth) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2Stride1x1Input3x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -363,7 +364,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2Stride1x1Input3x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2Stride1x1Input1x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 3, {1, 2, 3});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
@@ -378,7 +379,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2Stride1x1Input1x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x1x8x8Input1x1x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(64);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -398,7 +399,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1x8x8Input1x1x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input16x1x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(16 * 1 * 1 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -419,7 +420,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input16x1x1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input16x1x2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   constexpr int bs = 16;
   constexpr int kx = 2;
@@ -450,7 +451,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input16x1x2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   constexpr int kx = 2;
   constexpr int ky = 2;
@@ -482,7 +483,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x8x8Input16x1x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(16, 1, 8, 8);
   for (int i0 = 0; i0 < 16; ++i0) {
@@ -510,7 +511,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x8x8Input16x1x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(2 * 8 * 8);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -536,7 +537,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(2 * 2 * 8 * 8);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -562,7 +563,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(32 * 2 * 8 * 8);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -602,7 +603,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter16x16x1x1Input16x16x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(16, 16, 1, 1);
   Array4D<float> filter_array(16, 16, 1, 1);
@@ -628,7 +629,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter16x16x1x1Input16x16x1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, FlatRhsDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 4 * 6);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -640,14 +641,14 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatRhsDilation) {
   builder.ConvGeneralDilated(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{2, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 2, 2, {3924, 4257, 5922, 6255});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation1D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -659,14 +660,14 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation1D) {
   builder.ConvGeneralDilated(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{},
       /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 8, {10, 2, 20, 3, 30, 4, 40, 5});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 3 * 4);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -682,8 +683,7 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
   builder.ConvGeneralDilated(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{2, 1},
       /*padding=*/{{1, 0}, {0, 0}}, /*lhs_dilation=*/{3, 2},
-      /*rhs_dilation=*/{},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      /*rhs_dilation=*/{}, XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 3, 5,
                           {204, 40, 406, 60, 608,       //
@@ -693,7 +693,7 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingOnBothEnds) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -705,14 +705,14 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingOnBothEnds) {
   builder.ConvGeneral(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {-1, -1}},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 2, {23, 34});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingLowAndPositivePaddingHigh) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -724,14 +724,14 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingLowAndPositivePaddingHigh) {
   builder.ConvGeneral(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {-1, 2}},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 5, {23, 34, 45, 50, 0});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingLowAndNegativePaddingHigh) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -743,14 +743,14 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingLowAndNegativePaddingHigh) {
   builder.ConvGeneral(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {2, -1}},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 5, {0, 1, 12, 23, 34});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -763,7 +763,7 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) {
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {3, 2}},
       /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   // input:
   //   [1, 2, 3, 4, 5] --dilate-> [1, 0, 2, 0, 3, 0, 4, 0, 5]
@@ -775,7 +775,7 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) {
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingAndDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -788,7 +788,7 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingAndDilation) {
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {-3, -2}},
       /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   // input:
   //   [1, 2, 3, 4, 5] --dilate-> [1, 0, 2, 0, 3, 0, 4, 0, 5]
@@ -821,7 +821,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x1x2x3_Filter2x1x1x2) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -854,7 +854,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x16x1x1_Filter1x16x1x1) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -887,7 +887,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter1x16x1x1) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -920,7 +920,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter16x16x1x1) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -954,7 +954,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -966,7 +966,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1010,7 +1010,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1054,7 +1054,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1095,7 +1095,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 2);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1147,7 +1147,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) {
 //   BackwardInputConv([1,2,3], [5,6], padding_low=0, padding_high=1)
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardInputLowPaddingLessThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
       Array4D<float>(1, 1, 1, 3, /*values=*/{1, 2, 3}));
@@ -1166,19 +1166,18 @@ XLA_TEST_F(ConvolutionVariantsTest,
 //   BackwardInputConv([1], [1,10,100], stride=3, padding=(2,1))
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardInputLowPaddingGreaterThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
       Array4D<float>(1, 1, 1, 1, /*values=*/{1}));
   auto weights = builder.ConstantR4FromArray4D<float>(
       Array4D<float>(1, 1, 1, 3, /*values=*/{1, 10, 100}));
   auto mirrored_weights = builder.Rev(weights, {2, 3});
-  builder.ConvGeneralDilated(
-      gradients, mirrored_weights,
-      /*window_strides=*/{1, 1},
-      /*padding=*/{{0, 0}, {0, 3}},
-      /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+  builder.ConvGeneralDilated(gradients, mirrored_weights,
+                             /*window_strides=*/{1, 1},
+                             /*padding=*/{{0, 0}, {0, 3}},
+                             /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{},
+                             XlaBuilder::CreateDefaultConvDimensionNumbers());
   ComputeAndCompareR4<float>(&builder, {{{{100, 0}}}}, {}, error_spec_);
 }
 
@@ -1187,7 +1186,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
 // into
 //   BackwardInputConv([1], [1,10,100], padding=(1,1))
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
       Array4D<float>(1, 1, 1, 1, /*values=*/{1}));
@@ -1208,7 +1207,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) {
 // However, XLA:GPU doesn't actually fuse it because PadInsertion doesn't
 // support negative padding on backward convolution yet (b/32744257).
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
       Array4D<float>(1, 1, 1, 3, /*values=*/{1, 2, 3}));
@@ -1224,7 +1223,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) {
 
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardFilterLowPaddingLessThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,1,2,3,4,0,0
   // gradients:        100,10,1 -dilate-> 100,0,10,0,1
@@ -1240,7 +1239,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
       /*window_strides=*/{1, 1},
       /*padding=*/{{0, 0}, {1, 2}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
   builder.Transpose(forward_conv, {0, 1, 2, 3});
 
   ComputeAndCompareR4<float>(&builder, {{{{24, 130, 240}}}}, {}, error_spec_);
@@ -1248,7 +1247,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
 
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardFilterLowPaddingGreaterThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,0,1,2,3,4
   // gradients:        100,10,1 -dilate-> 100,0,10,0,1
@@ -1266,14 +1265,14 @@ XLA_TEST_F(ConvolutionVariantsTest,
       /*window_strides=*/{1, 1},
       /*padding=*/{{0, 0}, {2, 0}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
   builder.Transpose(forward_conv, {0, 1, 2, 3});
 
   ComputeAndCompareR4<float>(&builder, {{{{13, 24}}}}, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,0,1,2,3,4,0
   // gradients:        100,10,1 -dilate-> 100,0,10,0,1
@@ -1293,14 +1292,14 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) {
       /*window_strides=*/{1, 1},
       /*padding=*/{{0, 0}, {2, 1}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
   builder.Transpose(forward_conv, {0, 1, 2, 3});
 
   ComputeAndCompareR4<float>(&builder, {{{{13, 24, 130}}}}, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR3FromArray3D<float>(
       Array3D<float>(1, 1, 1, /*value=*/1));
@@ -1314,26 +1313,26 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding1D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto activations =
       builder.ConstantR3FromArray3D<float>(Array3D<float>({{{1, 2, 3, 4}}}));
   auto gradients =
       builder.ConstantR3FromArray3D<float>(Array3D<float>({{{100, 10, 1}}}));
-  auto forward_conv = builder.ConvGeneralDilated(
-      activations, gradients,
-      /*window_strides=*/{1},
-      /*padding=*/{{2, 1}},
-      /*lhs_dilation=*/{}, /*rhs_dilation=*/{2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers(
-          /*num_spatial_dims=*/1));
+  auto forward_conv =
+      builder.ConvGeneralDilated(activations, gradients,
+                                 /*window_strides=*/{1},
+                                 /*padding=*/{{2, 1}},
+                                 /*lhs_dilation=*/{}, /*rhs_dilation=*/{2},
+                                 XlaBuilder::CreateDefaultConvDimensionNumbers(
+                                     /*num_spatial_dims=*/1));
   builder.Transpose(forward_conv, {0, 1, 2});
 
   ComputeAndCompareR3<float>(&builder, {{{13, 24, 130}}}, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients_flat = Literal::CreateR1<float>({1});
   auto gradients_literal =
@@ -1357,7 +1356,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto activations_flat = Literal::CreateR1<float>({1, 2, 3, 4});
   auto activations_literal =
@@ -1378,7 +1377,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) {
       /*window_strides=*/{1, 1, 1},
       /*padding=*/{{0, 0}, {0, 0}, {2, 1}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers(
+      XlaBuilder::CreateDefaultConvDimensionNumbers(
           /*num_spatial_dims=*/3));
   builder.Transpose(forward_conv, {0, 1, 2, 3, 4});
   ComputeAndCompareLiteral(&builder, *expected_literal, {}, error_spec_);
-- 
GitLab


From bb4a80c92105426ccf20a98c4291a1a3f8499b54 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 15:56:12 -0700
Subject: [PATCH 328/450] Implement exporting the keys/values in a hash table.

PiperOrigin-RevId: 193999421
---
 tensorflow/contrib/lookup/lookup_ops_test.py  |  6 +++++
 .../core/kernels/initializable_lookup_table.h |  2 +-
 tensorflow/core/kernels/lookup_table_op.h     | 24 +++++++++++++++++++
 tensorflow/python/ops/lookup_ops.py           | 20 ++++++++++++++++
 4 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index f681b7b132..5d4682ec9f 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -58,6 +58,12 @@ class HashTableOpTest(test.TestCase):
       result = output.eval()
       self.assertAllEqual([0, 1, -1], result)
 
+      exported_keys_tensor, exported_values_tensor = table.export()
+
+      self.assertItemsEqual([b"brain", b"salad", b"surgery"],
+                            exported_keys_tensor.eval())
+      self.assertItemsEqual([0, 1, 2], exported_values_tensor.eval())
+
   def testHashTableFindHighRank(self):
     with self.test_session():
       default_val = -1
diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h
index edb779540f..990cbceac2 100644
--- a/tensorflow/core/kernels/initializable_lookup_table.h
+++ b/tensorflow/core/kernels/initializable_lookup_table.h
@@ -51,7 +51,7 @@ class InitializableLookupTable : public LookupInterface {
         "Insert not supported by InitializableLookupTable implementations");
   }
 
-  Status ExportValues(OpKernelContext* context) final {
+  Status ExportValues(OpKernelContext* context) {
     return errors::Unimplemented(
         "ExportValues not supported by InitializableLookupTable "
         "implementations");
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 29a0cc91fe..3977f16299 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -177,6 +177,30 @@ class HashTable : public InitializableLookupTable {
     return table_ ? table_->size() : 0;
   }
 
+  Status ExportValues(OpKernelContext* context) override {
+    if (!is_initialized_) {
+      return errors::Aborted("HashTable is not initialized.");
+    }
+
+    const int64 size = table_->size();
+
+    Tensor* keys;
+    Tensor* values;
+    TF_RETURN_IF_ERROR(
+        context->allocate_output("keys", TensorShape({size}), &keys));
+    TF_RETURN_IF_ERROR(
+        context->allocate_output("values", TensorShape({size}), &values));
+
+    auto keys_data = keys->flat<K>();
+    auto values_data = values->flat<V>();
+    int64 i = 0;
+    for (auto it = table_->begin(); it != table_->end(); ++it, ++i) {
+      keys_data(i) = it->first;
+      values_data(i) = it->second;
+    }
+    return Status::OK();
+  }
+
   DataType key_dtype() const override { return DataTypeToEnum<K>::v(); }
 
   DataType value_dtype() const override { return DataTypeToEnum<V>::v(); }
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 6f043f60e6..0e547689cc 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -277,7 +277,27 @@ class HashTable(InitializableLookupTableBase):
           name=scope)
 
       super(HashTable, self).__init__(table_ref, default_value, initializer)
+      self._value_shape = self._default_value.get_shape()
 
+  def export(self, name=None):
+    """Returns tensors of all keys and values in the table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A pair of tensors with the first tensor containing all keys and the
+        second tensor containing all values in the table.
+    """
+    with ops.name_scope(name, "%s_Export" % self._name,
+                        [self._table_ref]) as name:
+      with ops.colocate_with(self._table_ref):
+        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+            self._table_ref, self._key_dtype, self._value_dtype, name=name)
+
+    exported_values.set_shape(exported_keys.get_shape().concatenate(
+        self._value_shape))
+    return exported_keys, exported_values
 
 class TableInitializerBase(object):
   """Base class for lookup table initializers."""
-- 
GitLab


From ff15c81e2b92ef8fb47bb15790cffd18377a4ef2 Mon Sep 17 00:00:00 2001
From: Andrew Cotter <acotter@google.com>
Date: Mon, 23 Apr 2018 15:57:02 -0700
Subject: [PATCH 329/450] This is a library for performing constrained
 optimization. It defines two interfaces: ConstrainedMinimizationProblem,
 which specifies a constrained optimization problem, and ConstrainedOptimizer,
 which is slightly different from a tf.train.Optimizer, mostly due to the fact
 that it is meant to optimize ConstrainedMinimizationProblems. In addition to
 these two interfaces, three ConstrainedOptimizer implementations are
 included, as well as helper functions which, given a set of candidate
 solutions, heuristically find the best candidate (to the constrained
 problem), or the best distribution over candidates.

For more details, please see our arXiv paper: "https://arxiv.org/abs/1804.06500".

PiperOrigin-RevId: 193999550
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/cmake/python_modules.txt   |   2 +
 .../contrib/constrained_optimization/BUILD    |  91 +++
 .../constrained_optimization/README.md        | 345 ++++++++++
 .../constrained_optimization/__init__.py      |  41 ++
 .../python/candidates.py                      | 319 ++++++++++
 .../python/candidates_test.py                 |  95 +++
 .../constrained_minimization_problem.py       | 123 ++++
 .../python/constrained_optimizer.py           | 208 ++++++
 .../python/external_regret_optimizer.py       | 375 +++++++++++
 .../python/external_regret_optimizer_test.py  | 136 ++++
 .../python/swap_regret_optimizer.py           | 595 ++++++++++++++++++
 .../python/swap_regret_optimizer_test.py      | 212 +++++++
 .../python/test_util.py                       |  58 ++
 tensorflow/tools/pip_package/BUILD            |   1 +
 16 files changed, 2603 insertions(+)
 create mode 100644 tensorflow/contrib/constrained_optimization/BUILD
 create mode 100644 tensorflow/contrib/constrained_optimization/README.md
 create mode 100644 tensorflow/contrib/constrained_optimization/__init__.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/candidates.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/candidates_test.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/test_util.py

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 8edb8654b8..abdbdb4cd2 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -31,6 +31,7 @@ py_library(
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/coder:coder_py",
         "//tensorflow/contrib/compiler:compiler_py",
+        "//tensorflow/contrib/constrained_optimization",
         "//tensorflow/contrib/copy_graph:copy_graph_py",
         "//tensorflow/contrib/crf:crf_py",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 0d163daa6e..7f33d460dc 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -29,6 +29,7 @@ from tensorflow.contrib import cloud
 from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib import coder
 from tensorflow.contrib import compiler
+from tensorflow.contrib import constrained_optimization
 from tensorflow.contrib import copy_graph
 from tensorflow.contrib import crf
 from tensorflow.contrib import cudnn_rnn
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 932a6eeeaa..2554b3a6e0 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -147,6 +147,8 @@ tensorflow/contrib/coder/python
 tensorflow/contrib/coder/python/layers
 tensorflow/contrib/coder/python/ops
 tensorflow/contrib/compiler
+tensorflow/contrib/constrained_optimization
+tensorflow/contrib/constrained_optimization/python
 tensorflow/contrib/copy_graph
 tensorflow/contrib/copy_graph/python
 tensorflow/contrib/copy_graph/python/util
diff --git a/tensorflow/contrib/constrained_optimization/BUILD b/tensorflow/contrib/constrained_optimization/BUILD
new file mode 100644
index 0000000000..619153df67
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/BUILD
@@ -0,0 +1,91 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+# Transitive dependencies of this target will be included in the pip package.
+py_library(
+    name = "constrained_optimization_pip",
+    deps = [
+        ":constrained_optimization",
+        ":test_util",
+    ],
+)
+
+py_library(
+    name = "constrained_optimization",
+    srcs = [
+        "__init__.py",
+        "python/candidates.py",
+        "python/constrained_minimization_problem.py",
+        "python/constrained_optimizer.py",
+        "python/external_regret_optimizer.py",
+        "python/swap_regret_optimizer.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "candidates_test",
+    srcs = ["python/candidates_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+# NOTE: This library can't be "testonly" since it needs to be included in the
+# pip package.
+py_library(
+    name = "test_util",
+    srcs = ["python/test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:standard_ops",
+    ],
+)
+
+py_test(
+    name = "external_regret_optimizer_test",
+    srcs = ["python/external_regret_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "swap_regret_optimizer_test",
+    srcs = ["python/swap_regret_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/constrained_optimization/README.md b/tensorflow/contrib/constrained_optimization/README.md
new file mode 100644
index 0000000000..c65a150464
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/README.md
@@ -0,0 +1,345 @@
+<!-- TODO(acotter): Add usage example of non-convex optimization and stochastic classification. -->
+
+# ConstrainedOptimization (TFCO)
+
+TFCO is a library for optimizing inequality-constrained problems in TensorFlow.
+Both the objective function and the constraints are represented as Tensors,
+giving users the maximum amount of flexibility in specifying their optimization
+problems.
+
+This flexibility makes optimization considerably more difficult: on a non-convex
+problem, if one uses the "standard" approach of introducing a Lagrange
+multiplier for each constraint, and then jointly maximizing over the Lagrange
+multipliers and minimizing over the model parameters, then a stable stationary
+point might not even *exist*. Hence, in some cases, oscillation, instead of
+convergence, is inevitable.
+
+Thankfully, it turns out that even if, over the course of optimization, no
+*particular* iterate does a good job of minimizing the objective while
+satisfying the constraints, the *sequence* of iterates, on average, usually
+will. This observation suggests the following approach: at training time, we'll
+periodically snapshot the model state during optimization; then, at evaluation
+time, each time we're given a new example to evaluate, we'll sample one of the
+saved snapshots uniformly at random, and apply it to the example. This
+*stochastic model* will generally perform well, both with respect to the
+objective function, and the constraints.
+
+In fact, we can do better: it's possible to post-process the set of snapshots to
+find a distribution over at most $$m+1$$ snapshots, where $$m$$ is the number of
+constraints, that will be at least as good (and will usually be much better)
+than the (much larger) uniform distribution described above. If you're unable or
+unwilling to use a stochastic model at all, then you can instead use a heuristic
+to choose the single best snapshot.
+
+For full details, motivation, and theoretical results on the approach taken by
+this library, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+which will be referred to as [CoJiSr18] throughout the remainder of this
+document.
+
+### Proxy Constraints
+
+Imagine that we want to constrain the recall of a binary classifier to be at
+least 90%. Since the recall is proportional to the number of true positive
+classifications, which itself is a sum of indicator functions, this constraint
+is non-differentiable, and therefore cannot be used in a problem that will be
+optimized using a (stochastic) gradient-based algorithm.
+
+For this and similar problems, TFCO supports so-called *proxy constraints*,
+which are (at least semi-differentiable) approximations of the original
+constraints. For example, one could create a proxy recall function by replacing
+the indicator functions with sigmoids. During optimization, each proxy
+constraint function will be penalized, with the magnitude of the penalty being
+chosen to satisfy the corresponding *original* (non-proxy) constraint.
+
+On a problem including proxy constraints&mdash;even a convex problem&mdash;the
+Lagrangian approach discussed above isn't guaranteed to work. However, a
+different algorithm, based on minimizing *swap regret*, does work. Aside from
+this difference, the recommended procedure for optimizing a proxy-constrained
+problem remains the same: periodically snapshot the model during optimization,
+and then either find the best $$m+1$$-sized distribution, or heuristically
+choose the single best snapshot.
+
+## Components
+
+*   [constrained_minimization_problem](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py):
+    contains the `ConstrainedMinimizationProblem` interface. Your own
+    constrained optimization problems should be represented using
+    implementations of this interface.
+
+*   [constrained_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py):
+    contains the `ConstrainedOptimizer` interface, which is similar to (but
+    different from) `tf.train.Optimizer`, with the main difference being that
+    `ConstrainedOptimizer`s are given `ConstrainedMinimizationProblem`s to
+    optimize, and perform constrained optimization.
+
+    *   [external_regret_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py):
+        contains the `AdditiveExternalRegretOptimizer` implementation, which is
+        a `ConstrainedOptimizer` implementing the Lagrangian approach discussed
+        above (with additive updates to the Lagrange multipliers). You should
+        use this optimizer for problems *without* proxy constraints. It may also
+        work for problems with proxy constraints, but we recommend using a swap
+        regret optimizer, instead.
+
+        This optimizer is most similar to Algorithm 3 in Appendix C.3 of
+        [CoJiSr18], and is discussed in Section 3. The two differences are that
+        it uses proxy constraints (if they're provided) in the update of the
+        model parameters, and uses `tf.train.Optimizer`s, instead of SGD, for
+        the "inner" updates.
+
+    *   [swap_regret_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py):
+        contains the `AdditiveSwapRegretOptimizer` and
+        `MultiplicativeSwapRegretOptimizer` implementations, which are
+        `ConstrainedOptimizer`s implementing the swap-regret minimization
+        approach mentioned above (with additive or multiplicative updates,
+        respectively, to the parameters associated with the
+        constraints&mdash;these parameters are not Lagrange multipliers, but
+        play a similar role). You should use one of these optimizers (we suggest
+        `MultiplicativeSwapRegretOptimizer`) for problems *with* proxy
+        constraints.
+
+        The `MultiplicativeSwapRegretOptimizer` is most similar to Algorithm 2
+        in Section 4 of [CoJiSr18], with the difference being that it uses
+        `tf.train.Optimizer`s, instead of SGD, for the "inner" updates. The
+        `AdditiveSwapRegretOptimizer` differs further in that it performs
+        additive (instead of multiplicative) updates of the stochastic matrix.
+
+*   [candidates](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/candidates.py):
+    contains two functions, `find_best_candidate_distribution` and
+    `find_best_candidate_index`. Both of these functions are given a set of
+    candidate solutions to a constrained optimization problem, from which the
+    former finds the best distribution over at most $$m+1$$ candidates, and the
+    latter heuristically finds the single best candidate. As discussed above,
+    the set of candidates will typically be model snapshots saved periodically
+    during optimization. Both of these functions require that scipy be
+    installed.
+
+    The `find_best_candidate_distribution` function implements the approach
+    described in Lemma 3 of [CoJiSr18], while `find_best_candidate_index`
+    implements the heuristic used for hyperparameter search in the experiments
+    of Section 5.2.
+
+## Convex Example with Proxy Constraints
+
+This is a simple example of recall-constrained optimization on simulated data:
+we will try to find a classifier that minimizes the average hinge loss while
+constraining recall to be at least 90%.
+
+We'll start with the required imports&mdash;notice the definition of `tfco`:
+
+```python
+import math
+import numpy as np
+import tensorflow as tf
+
+tfco = tf.contrib.constrained_optimization
+```
+
+We'll now create an implementation of the `ConstrainedMinimizationProblem` class
+for this problem. The constructor takes three parameters: a Tensor containing
+the classification labels (0 or 1) for every training example, another Tensor
+containing the model's predictions on every training example (sometimes called
+the "logits"), and the lower bound on recall that will be enforced using a
+constraint.
+
+This implementation will contain both constraints *and* proxy constraints: the
+former represents the constraint that the true recall (defined in terms of the
+*number* of true positives) be at least `recall_lower_bound`, while the latter
+represents the same constraint, but on a hinge approximation of the recall.
+
+```python
+class ExampleProblem(tfco.ConstrainedMinimizationProblem):
+
+  def __init__(self, labels, predictions, recall_lower_bound):
+    self._labels = labels
+    self._predictions = predictions
+    self._recall_lower_bound = recall_lower_bound
+    # The number of positively-labeled examples.
+    self._positive_count = tf.reduce_sum(self._labels)
+
+  @property
+  def objective(self):
+    return tf.losses.hinge_loss(labels=self._labels, logits=self._predictions)
+
+  @property
+  def constraints(self):
+    true_positives = self._labels * tf.to_float(self._predictions > 0)
+    true_positive_count = tf.reduce_sum(true_positives)
+    recall = true_positive_count / self._positive_count
+    # The constraint is (recall >= self._recall_lower_bound), which we convert
+    # to (self._recall_lower_bound - recall <= 0) because
+    # ConstrainedMinimizationProblems must always provide their constraints in
+    # the form (tensor <= 0).
+    #
+    # The result of this function should be a tensor, with each element being
+    # a quantity that is constrained to be nonpositive. We only have one
+    # constraint, so we return a one-element tensor.
+    return self._recall_lower_bound - recall
+
+  @property
+  def proxy_constraints(self):
+    # Use 1 - hinge since we're SUBTRACTING recall in the constraint function,
+    # and we want the proxy constraint function to be convex.
+    true_positives = self._labels * tf.minimum(1.0, self._predictions)
+    true_positive_count = tf.reduce_sum(true_positives)
+    recall = true_positive_count / self._positive_count
+    # Please see the corresponding comment in the constraints property.
+    return self._recall_lower_bound - recall
+```
+
+We'll now create a simple simulated dataset by sampling 1000 random
+10-dimensional feature vectors from a Gaussian, finding their labels using a
+random "ground truth" linear model, and then adding noise by randomly flipping
+200 labels.
+
+```python
+# Create a simulated 10-dimensional training dataset consisting of 1000 labeled
+# examples, of which 800 are labeled correctly and 200 are mislabeled.
+num_examples = 1000
+num_mislabeled_examples = 200
+dimension = 10
+# We will constrain the recall to be at least 90%.
+recall_lower_bound = 0.9
+
+# Create random "ground truth" parameters to a linear model.
+ground_truth_weights = np.random.normal(size=dimension) / math.sqrt(dimension)
+ground_truth_threshold = 0
+
+# Generate a random set of features for each example.
+features = np.random.normal(size=(num_examples, dimension)).astype(
+    np.float32) / math.sqrt(dimension)
+# Compute the labels from these features given the ground truth linear model.
+labels = (np.matmul(features, ground_truth_weights) >
+          ground_truth_threshold).astype(np.float32)
+# Add noise by randomly flipping num_mislabeled_examples labels.
+mislabeled_indices = np.random.choice(
+    num_examples, num_mislabeled_examples, replace=False)
+labels[mislabeled_indices] = 1 - labels[mislabeled_indices]
+```
+
+We're now ready to construct our model, and the corresponding optimization
+problem. We'll use a linear model of the form $$f(x) = w^T x - t$$, where $$w$$
+is the `weights`, and $$t$$ is the `threshold`. The `problem` variable will hold
+an instance of the `ExampleProblem` class we created earlier.
+
+```python
+# Create variables containing the model parameters.
+weights = tf.Variable(tf.zeros(dimension), dtype=tf.float32, name="weights")
+threshold = tf.Variable(0.0, dtype=tf.float32, name="threshold")
+
+# Create the optimization problem.
+constant_labels = tf.constant(labels, dtype=tf.float32)
+constant_features = tf.constant(features, dtype=tf.float32)
+predictions = tf.tensordot(constant_features, weights, axes=(1, 0)) - threshold
+problem = ExampleProblem(
+    labels=constant_labels,
+    predictions=predictions,
+    recall_lower_bound=recall_lower_bound,
+)
+```
+
+We're almost ready to train our model, but first we'll create a couple of
+functions to measure its performance. We're interested in two quantities: the
+average hinge loss (which we seek to minimize), and the recall (which we
+constrain).
+
+```python
+def average_hinge_loss(labels, predictions):
+  num_examples, = np.shape(labels)
+  signed_labels = (labels * 2) - 1
+  total_hinge_loss = np.sum(np.maximum(0.0, 1.0 - signed_labels * predictions))
+  return total_hinge_loss / num_examples
+
+def recall(labels, predictions):
+  positive_count = np.sum(labels)
+  true_positives = labels * (predictions > 0)
+  true_positive_count = np.sum(true_positives)
+  return true_positive_count / positive_count
+```
+
+As was mentioned earlier, external regret optimizers suffice for problems
+without proxy constraints, but swap regret optimizers are recommended for
+problems *with* proxy constraints. Since this problem contains proxy
+constraints, we use the `MultiplicativeSwapRegretOptimizer`.
+
+For this problem, the constraint is fairly easy to satisfy, so we can use the
+same "inner" optimizer (an `AdagradOptimizer` with a learning rate of 1) for
+optimization of both the model parameters (`weights` and `threshold`), and the
+internal parameters associated with the constraints (these are the analogues of
+the Lagrange multipliers used by the `MultiplicativeSwapRegretOptimizer`). For
+more difficult problems, it will often be necessary to use different optimizers,
+with different learning rates (presumably found via a hyperparameter search): to
+accomplish this, pass *both* the `optimizer` and `constraint_optimizer`
+parameters to `MultiplicativeSwapRegretOptimizer`'s constructor.
+
+Since this is a convex problem (both the objective and proxy constraint
+functions are convex), we can just take the last iterate. Periodic snapshotting,
+and the use of the `find_best_candidate_distribution` or
+`find_best_candidate_index` functions, is generally only necessary for
+non-convex problems (and even then, it isn't *always* necessary).
+
+```python
+with tf.Session() as session:
+  optimizer = tfco.MultiplicativeSwapRegretOptimizer(
+      optimizer=tf.train.AdagradOptimizer(learning_rate=1.0))
+  train_op = optimizer.minimize(problem)
+
+  session.run(tf.global_variables_initializer())
+  for ii in range(1000):
+    session.run(train_op)
+
+  trained_weights, trained_threshold = session.run((weights, threshold))
+
+trained_predictions = np.matmul(features, trained_weights) - trained_threshold
+print("Constrained average hinge loss = %f" % average_hinge_loss(
+    labels, trained_predictions))
+print("Constrained recall = %f" % recall(labels, trained_predictions))
+```
+
+Running the above code gives the following output (due to the randomness of the
+dataset, you'll get a different result when you run it):
+
+```none
+Constrained average hinge loss = 0.710019
+Constrained recall = 0.899811
+```
+
+As we hoped, the recall is extremely close to 90%&mdash;and, thanks to the use
+of proxy constraints, this is the *true* recall, not a hinge approximation.
+
+For comparison, let's try optimizing the same problem *without* the recall
+constraint:
+
+```python
+with tf.Session() as session:
+  optimizer = tf.train.AdagradOptimizer(learning_rate=1.0)
+  # For optimizing the unconstrained problem, we just minimize the "objective"
+  # portion of the minimization problem.
+  train_op = optimizer.minimize(problem.objective)
+
+  session.run(tf.global_variables_initializer())
+  for ii in range(1000):
+    session.run(train_op)
+
+  trained_weights, trained_threshold = session.run((weights, threshold))
+
+trained_predictions = np.matmul(features, trained_weights) - trained_threshold
+print("Unconstrained average hinge loss = %f" % average_hinge_loss(
+    labels, trained_predictions))
+print("Unconstrained recall = %f" % recall(labels, trained_predictions))
+```
+
+This code gives the following output (again, you'll get a different answer,
+since the dataset is random):
+
+```none
+Unconstrained average hinge loss = 0.627271
+Unconstrained recall = 0.793951
+```
+
+Because there is no constraint, the unconstrained problem does a better job of
+minimizing the average hinge loss, but naturally doesn't approach 90% recall.
diff --git a/tensorflow/contrib/constrained_optimization/__init__.py b/tensorflow/contrib/constrained_optimization/__init__.py
new file mode 100644
index 0000000000..1e49ba9f17
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/__init__.py
@@ -0,0 +1,41 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A library for performing constrained optimization in TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.constrained_optimization.python.candidates import *
+from tensorflow.contrib.constrained_optimization.python.constrained_minimization_problem import *
+from tensorflow.contrib.constrained_optimization.python.constrained_optimizer import *
+from tensorflow.contrib.constrained_optimization.python.external_regret_optimizer import *
+from tensorflow.contrib.constrained_optimization.python.swap_regret_optimizer import *
+# pylint: enable=wildcard-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "AdditiveExternalRegretOptimizer",
+    "AdditiveSwapRegretOptimizer",
+    "ConstrainedMinimizationProblem",
+    "ConstrainedOptimizer",
+    "find_best_candidate_distribution",
+    "find_best_candidate_index",
+    "MultiplicativeSwapRegretOptimizer",
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/constrained_optimization/python/candidates.py b/tensorflow/contrib/constrained_optimization/python/candidates.py
new file mode 100644
index 0000000000..ac86a6741b
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/candidates.py
@@ -0,0 +1,319 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Code for optimizing over a set of candidate solutions.
+
+The functions in this file deal with the constrained problem:
+
+> minimize f(w)
+> s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+Here, f(w) is the "objective function", and g_i(w) is the ith (of m) "constraint
+function". Given the values of the objective and constraint functions for a set
+of n "candidate solutions" {w_0,w_1,...,w_{n-1}} (for a total of n objective
+function values, and n*m constraint function values), the
+`find_best_candidate_distribution` function finds the best DISTRIBUTION over
+these candidates, while `find_best_candidate_index` heuristically finds the
+single best candidate.
+
+Both of these functions have dependencies on `scipy`, so if you want to call
+them, then you must make sure that `scipy` is available. The imports are
+performed inside the functions themselves, so if they're not actually called,
+then `scipy` is not needed.
+
+For more specifics, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+The `find_best_candidate_distribution` function implements the approach
+described in Lemma 3, while `find_best_candidate_index` implements the heuristic
+used for hyperparameter search in the experiments of Section 5.2.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+
+def _find_best_candidate_distribution_helper(objective_vector,
+                                             constraints_matrix,
+                                             maximum_violation=0.0):
+  """Finds a distribution minimizing an objective subject to constraints.
+
+  This function deals with the constrained problem:
+
+  > minimize f(w)
+  > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+  Here, f(w) is the "objective function", and g_i(w) is the ith (of m)
+  "constraint function". Given a set of n "candidate solutions"
+  {w_0,w_1,...,w_{n-1}}, this function finds a distribution over these n
+  candidates that, in expectation, minimizes the objective while violating
+  the constraints by no more than `maximum_violation`. If no such distribution
+  exists, it returns an error message (Go-style) instead of raising.
+
+  The `objective_vector` parameter should be a numpy array with shape (n,), for
+  which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a
+  numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j).
+
+  This function will return a distribution for which at most m+1 probabilities,
+  and often fewer, are nonzero.
+
+  Args:
+    objective_vector: numpy array of shape (n,), where n is the number of
+      "candidate solutions". Contains the objective function values.
+    constraints_matrix: numpy array of shape (m,n), where m is the number of
+      constraints and n is the number of "candidate solutions". Contains the
+      constraint violation magnitudes.
+    maximum_violation: nonnegative float, the maximum amount by which any
+      constraint may be violated, in expectation.
+
+  Returns:
+    A pair (`result`, `message`), exactly one of which is None. If `message` is
+      None, then the `result` contains the optimal distribution as a numpy array
+      of shape (n,). If `result` is None, then `message` contains an error
+      message.
+
+  Raises:
+    ValueError: If `objective_vector` and `constraints_matrix` have inconsistent
+      shapes, or if `maximum_violation` is negative.
+    ImportError: If we're unable to import `scipy.optimize`.
+  """
+  if maximum_violation < 0.0:
+    raise ValueError("maximum_violation must be nonnegative")
+
+  mm, nn = np.shape(constraints_matrix)
+  if (nn,) != np.shape(objective_vector):
+    raise ValueError(
+        "objective_vector must have shape (n,), and constraints_matrix (m, n),"
+        " where n is the number of candidates, and m is the number of "
+        "constraints")
+
+  # We import scipy inline, instead of at the top of the file, so that a scipy
+  # dependency is only introduced if either find_best_candidate_distribution()
+  # or find_best_candidate_index() is actually called.
+  import scipy.optimize  # pylint: disable=g-import-not-at-top
+
+  # Feasibility (within maximum_violation) constraints.
+  a_ub = constraints_matrix
+  b_ub = np.full((mm, 1), maximum_violation)
+  # Sum-to-one constraint (the result must be a probability distribution).
+  a_eq = np.ones((1, nn))
+  b_eq = np.ones((1, 1))
+  # Nonnegativity constraints (no upper bound on any individual entry).
+  bounds = (0, None)
+
+  result = scipy.optimize.linprog(
+      objective_vector,
+      A_ub=a_ub,
+      b_ub=b_ub,
+      A_eq=a_eq,
+      b_eq=b_eq,
+      bounds=bounds)
+  # Go-style error reporting. We don't raise on error, since
+  # find_best_candidate_distribution() needs to handle the failure case, and we
+  # shouldn't use exceptions as flow-control.
+  if not result.success:
+    return (None, result.message)
+  else:
+    return (result.x, None)
+
+
+def find_best_candidate_distribution(objective_vector,
+                                     constraints_matrix,
+                                     epsilon=0.0):
+  """Finds a distribution minimizing an objective subject to constraints.
+
+  This function deals with the constrained problem:
+
+  > minimize f(w)
+  > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+  Here, f(w) is the "objective function", and g_i(w) is the ith (of m)
+  "constraint function". Given a set of n "candidate solutions"
+  {w_0,w_1,...,w_{n-1}}, this function finds a distribution over these n
+  candidates that, in expectation, minimizes the objective while violating
+  the constraints by the smallest possible amount (with the amount being found
+  via bisection search).
+
+  The `objective_vector` parameter should be a numpy array with shape (n,), for
+  which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a
+  numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j).
+
+  This function will return a distribution for which at most m+1 probabilities,
+  and often fewer, are nonzero.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  This function implements the approach described in Lemma 3.
+
+  Args:
+    objective_vector: numpy array of shape (n,), where n is the number of
+      "candidate solutions". Contains the objective function values.
+    constraints_matrix: numpy array of shape (m,n), where m is the number of
+      constraints and n is the number of "candidate solutions". Contains the
+      constraint violation magnitudes.
+    epsilon: nonnegative float, the threshold at which to terminate the
+      bisection search while searching for the minimal expected constraint
+      violation magnitude.
+
+  Returns:
+    The optimal distribution, as a numpy array of shape (n,).
+
+  Raises:
+    ValueError: If `objective_vector` and `constraints_matrix` have inconsistent
+      shapes, or if `epsilon` is negative.
+    ImportError: If we're unable to import `scipy.optimize`.
+  """
+  if epsilon < 0.0:
+    raise ValueError("epsilon must be nonnegative")
+
+  # If there is a feasible solution (i.e. with maximum_violation=0), then that's
+  # what we'll return.
+  pp, _ = _find_best_candidate_distribution_helper(objective_vector,
+                                                   constraints_matrix)
+  if pp is not None:
+    return pp
+
+  # The initial upper bound is the minimum, over all candidates, of the maximum
+  # per-candidate constraint violation.
+  lower = 0.0
+  upper = np.min(np.amax(constraints_matrix, axis=0))
+  best_pp, _ = _find_best_candidate_distribution_helper(
+      objective_vector, constraints_matrix, maximum_violation=upper)
+  assert best_pp is not None
+
+  # Throughout this loop, a maximum_violation of "lower" is not achievable,
+  # but a maximum_violation of "upper" is achievable.
+  while True:
+    middle = 0.5 * (lower + upper)
+    if (middle - lower <= epsilon) or (upper - middle <= epsilon):
+      break
+    else:
+      pp, _ = _find_best_candidate_distribution_helper(
+          objective_vector, constraints_matrix, maximum_violation=middle)
+      if pp is None:
+        lower = middle
+      else:
+        best_pp = pp
+        upper = middle
+
+  return best_pp
+
+
+def find_best_candidate_index(objective_vector,
+                              constraints_matrix,
+                              rank_objectives=False):
+  """Heuristically finds the best candidate solution to a constrained problem.
+
+  This function deals with the constrained problem:
+
+  > minimize f(w)
+  > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+  Here, f(w) is the "objective function", and g_i(w) is the ith (of m)
+  "constraint function". Given a set of n "candidate solutions"
+  {w_0,w_1,...,w_{n-1}}, this function finds the "best" solution according
+  to the following heuristic:
+
+    1. Across all models, the ith constraint violations (i.e. max{0, g_i(w)})
+       are ranked, as are the objectives (if rank_objectives=True).
+    2. Each model is then assigned its MAXIMUM rank across all m constraints
+       (and the objective, if rank_objectives=True).
+    3. The model with the minimal maximum rank is then identified. Ties are
+       broken using the objective function value.
+    4. The index of this "best" model is returned.
+
+  The `objective_vector` parameter should be a numpy array with shape (n,), for
+  which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a
+  numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j).
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  This function implements the heuristic used for hyperparameter search in the
+  experiments of Section 5.2.
+
+  Args:
+    objective_vector: numpy array of shape (n,), where n is the number of
+      "candidate solutions". Contains the objective function values.
+    constraints_matrix: numpy array of shape (m,n), where m is the number of
+      constraints and n is the number of "candidate solutions". Contains the
+      constraint violation magnitudes.
+    rank_objectives: bool, whether the objective function values should be
+      included in the initial ranking step. If True, both the objective and
+      constraints will be ranked. If False, only the constraints will be ranked.
+      In either case, the objective function values will be used for
+      tiebreaking.
+
+  Returns:
+    The index (in {0,1,...,n-1}) of the "best" model according to the above
+      heuristic.
+
+  Raises:
+    ValueError: If `objective_vector` and `constraints_matrix` have inconsistent
+      shapes.
+    ImportError: If we're unable to import `scipy.stats`.
+  """
+  mm, nn = np.shape(constraints_matrix)
+  if (nn,) != np.shape(objective_vector):
+    raise ValueError(
+        "objective_vector must have shape (n,), and constraints_matrix (m, n),"
+        " where n is the number of candidates, and m is the number of "
+        "constraints")
+
+  # We import scipy inline, instead of at the top of the file, so that a scipy
+  # dependency is only introduced if either find_best_candidate_distribution()
+  # or find_best_candidate_index() is actually called.
+  import scipy.stats  # pylint: disable=g-import-not-at-top
+
+  if rank_objectives:
+    maximum_ranks = scipy.stats.rankdata(objective_vector, method="min")
+  else:
+    maximum_ranks = np.zeros(nn, dtype=np.int64)
+  for ii in xrange(mm):
+    # Take the maximum of the constraint functions with zero, since we want to
+    # rank the magnitude of constraint *violations*. If the constraint is
+    # satisfied, then we don't care how much it's satisfied by (as a result, we
+    # expect all models satisfying a constraint to be tied at rank 1).
+    ranks = scipy.stats.rankdata(
+        np.maximum(0.0, constraints_matrix[ii, :]), method="min")
+    maximum_ranks = np.maximum(maximum_ranks, ranks)
+
+  best_index = None
+  best_rank = float("Inf")
+  best_objective = float("Inf")
+  for ii in xrange(nn):
+    if maximum_ranks[ii] < best_rank:
+      best_index = ii
+      best_rank = maximum_ranks[ii]
+      best_objective = objective_vector[ii]
+    elif (maximum_ranks[ii] == best_rank) and (objective_vector[ii] <=
+                                               best_objective):
+      best_index = ii
+      best_objective = objective_vector[ii]
+
+  return best_index
diff --git a/tensorflow/contrib/constrained_optimization/python/candidates_test.py b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
new file mode 100644
index 0000000000..a4c49d48bc
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
@@ -0,0 +1,95 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for constrained_optimization.python.candidates."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.constrained_optimization.python import candidates
+from tensorflow.python.platform import test
+
+
+class CandidatesTest(test.TestCase):
+
+  def test_inconsistent_shapes_for_best_distribution(self):
+    """find_best_candidate_distribution rejects inconsistent shapes."""
+    objective_vector = np.array([1, 2, 3])
+    constraints_matrix = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+    with self.assertRaises(ValueError):
+      _ = candidates.find_best_candidate_distribution(objective_vector,
+                                                      constraints_matrix)
+
+  def test_inconsistent_shapes_for_best_index(self):
+    """find_best_candidate_index rejects inconsistent shapes."""
+    objective_vector = np.array([1, 2, 3])
+    constraints_matrix = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+    with self.assertRaises(ValueError):
+      _ = candidates.find_best_candidate_index(objective_vector,
+                                               constraints_matrix)
+
+  def test_best_distribution(self):
+    """Distribution should match known solution."""
+    objective_vector = np.array(
+        [0.03053309, -0.06667082, 0.88355145, 0.46529806])
+    constraints_matrix = np.array(
+        [[-0.60164551, 0.36676229, 0.7856454, -0.8441711],
+         [0.00371592, -0.16392108, -0.59778071, -0.56908492]])
+    distribution = candidates.find_best_candidate_distribution(
+        objective_vector, constraints_matrix)
+    # Verify that the solution is a probability distribution.
+    self.assertTrue(np.all(distribution >= 0))
+    self.assertAlmostEqual(np.sum(distribution), 1.0)
+    # Verify that the solution satisfies the constraints.
+    maximum_constraint_violation = np.amax(
+        np.dot(constraints_matrix, distribution))
+    self.assertLessEqual(maximum_constraint_violation, 0)
+    # Verify that the solution matches that which we expect.
+    expected_distribution = np.array([0.37872711, 0.62127289, 0, 0])
+    self.assertAllClose(expected_distribution, distribution, rtol=0, atol=1e-6)
+
+  def test_best_index_rank_objectives_true(self):
+    """Index should match known solution when objectives are ranked."""
+    # Objective ranks = [2, 1, 4, 3].
+    objective_vector = np.array(
+        [0.03053309, -0.06667082, 0.88355145, 0.46529806])
+    # Constraint ranks = [[1, 3, 4, 1], [4, 1, 1, 1]].
+    constraints_matrix = np.array(
+        [[-0.60164551, 0.36676229, 0.7856454, -0.8441711],
+         [0.00371592, -0.16392108, -0.59778071, -0.56908492]])
+    # Maximum ranks = [4, 3, 4, 3]; ties broken by the lower objective.
+    index = candidates.find_best_candidate_index(
+        objective_vector, constraints_matrix, rank_objectives=True)
+    self.assertEqual(1, index)
+
+  def test_best_index_rank_objectives_false(self):
+    """Index should match known solution when objectives are not ranked."""
+    # Objective ranks = [2, 1, 4, 3] (ignored: rank_objectives=False).
+    objective_vector = np.array(
+        [0.03053309, -0.06667082, 0.88355145, 0.46529806])
+    # Constraint ranks = [[1, 3, 4, 1], [4, 1, 1, 1]].
+    constraints_matrix = np.array(
+        [[-0.60164551, 0.36676229, 0.7856454, -0.8441711],
+         [0.00371592, -0.16392108, -0.59778071, -0.56908492]])
+    # Maximum ranks = [4, 3, 4, 1].
+    index = candidates.find_best_candidate_index(
+        objective_vector, constraints_matrix, rank_objectives=False)
+    self.assertEqual(3, index)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
new file mode 100644
index 0000000000..70813fb217
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
@@ -0,0 +1,123 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines abstract class for `ConstrainedMinimizationProblem`s.
+
+A ConstrainedMinimizationProblem consists of an objective function to minimize,
+and a set of constraint functions that are constrained to be nonpositive.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class ConstrainedMinimizationProblem(object):
+  """Abstract class representing a `ConstrainedMinimizationProblem`.
+
+  A ConstrainedMinimizationProblem consists of an objective function to
+  minimize, and a set of constraint functions that are constrained to be
+  nonpositive.
+
+  In addition to the constraint functions, there may (optionally) be proxy
+  constraint functions: a ConstrainedOptimizer will attempt to penalize these
+  proxy constraint functions so as to satisfy the (non-proxy) constraints. Proxy
+  constraints could be used if the constraint functions are difficult or
+  impossible to optimize (e.g. if they're piecewise constant), in which case the
+  proxy constraints should be some approximation of the original constraints
+  that is well-enough behaved to permit successful optimization.
+  """
+
+  @abc.abstractproperty
+  def objective(self):
+    """Returns the objective function.
+
+    Returns:
+      A 0d tensor that should be minimized.
+    """
+    pass
+
+  @property
+  def num_constraints(self):
+    """Returns the number of constraints.
+
+    Returns:
+      An int containing the number of constraints.
+
+    Raises:
+      ValueError: If the constraints (or proxy_constraints, if present) do not
+        have fully-known shapes, OR if proxy_constraints are present, and the
+        shapes of constraints and proxy_constraints are fully-known, but they're
+        different.
+    """
+    constraints_shape = self.constraints.get_shape()
+    if self.proxy_constraints is None:
+      proxy_constraints_shape = constraints_shape
+    else:
+      proxy_constraints_shape = self.proxy_constraints.get_shape()
+
+    if (constraints_shape is None or proxy_constraints_shape is None or
+        any([ii is None for ii in constraints_shape.as_list()]) or
+        any([ii is None for ii in proxy_constraints_shape.as_list()])):
+      raise ValueError(
+          "constraints and proxy_constraints must have fully-known shapes")
+    if constraints_shape != proxy_constraints_shape:
+      raise ValueError(
+          "constraints and proxy_constraints must have the same shape")
+
+    size = 1
+    for ii in constraints_shape.as_list():
+      size *= ii
+    return int(size)
+
+  @abc.abstractproperty
+  def constraints(self):
+    """Returns the vector of constraint functions.
+
+    Letting g_i be the ith element of the constraints vector, the ith constraint
+    will be g_i <= 0.
+
+    Returns:
+      A tensor of constraint functions.
+    """
+    pass
+
+  # This is a property, instead of an abstract property, since it doesn't need
+  # to be overridden: if proxy_constraints returns None, then there are no
+  # proxy constraints.
+  @property
+  def proxy_constraints(self):
+    """Returns the optional vector of proxy constraint functions.
+
+    The difference between `constraints` and `proxy_constraints` is that, when
+    proxy constraints are present, the `constraints` are merely EVALUATED during
+    optimization, whereas the `proxy_constraints` are DIFFERENTIATED. If there
+    are no proxy constraints, then the `constraints` are both evaluated and
+    differentiated.
+
+    For example, if we want to impose constraints on step functions, then we
+    could use these functions for `constraints`. However, because a step
+    function has zero gradient almost everywhere, we can't differentiate these
+    functions, so we would take `proxy_constraints` to be some differentiable
+    approximation of `constraints`.
+
+    Returns:
+      A tensor of proxy constraint functions, or None if there are none.
+    """
+    return None
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
new file mode 100644
index 0000000000..8055545366
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
@@ -0,0 +1,208 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines base class for `ConstrainedOptimizer`s."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.training import optimizer as train_optimizer
+
+
+@six.add_metaclass(abc.ABCMeta)
+class ConstrainedOptimizer(object):
+  """Base class representing a constrained optimizer.
+
+  A ConstrainedOptimizer wraps a tf.train.Optimizer (or more than one), and
+  applies it to a ConstrainedMinimizationProblem. Unlike a tf.train.Optimizer,
+  which takes a tensor to minimize as a parameter to its minimize() method, a
+  constrained optimizer instead takes a ConstrainedMinimizationProblem.
+  """
+
+  def __init__(self, optimizer):
+    """Constructs a new `ConstrainedOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the
+        ConstrainedMinimizationProblem.
+
+    Returns:
+      A new `ConstrainedOptimizer`.
+    """
+    self._optimizer = optimizer
+
+  @property
+  def optimizer(self):
+    """Returns the `tf.train.Optimizer` used for optimization."""
+    return self._optimizer
+
+  def minimize_unconstrained(self,
+                             minimization_problem,
+                             global_step=None,
+                             var_list=None,
+                             gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                             aggregation_method=None,
+                             colocate_gradients_with_ops=False,
+                             name=None,
+                             grad_loss=None):
+    """Returns an `Op` for minimizing the unconstrained problem.
+
+    Unlike `minimize_constrained`, this function ignores the `constraints` (and
+    `proxy_constraints`) portion of the minimization problem entirely, and only
+    minimizes `objective`.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    return self.optimizer.minimize(
+        minimization_problem.objective,
+        global_step=global_step,
+        var_list=var_list,
+        gate_gradients=gate_gradients,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        name=name,
+        grad_loss=grad_loss)
+
+  @abc.abstractmethod
+  def minimize_constrained(self,
+                           minimization_problem,
+                           global_step=None,
+                           var_list=None,
+                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                           aggregation_method=None,
+                           colocate_gradients_with_ops=False,
+                           name=None,
+                           grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    Unlike `minimize_unconstrained`, this function attempts to find a solution
+    that minimizes the `objective` portion of the minimization problem while
+    satisfying the `constraints` portion.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    pass
+
+  def minimize(self,
+               minimization_problem,
+               unconstrained_steps=None,
+               global_step=None,
+               var_list=None,
+               gate_gradients=train_optimizer.Optimizer.GATE_OP,
+               aggregation_method=None,
+               colocate_gradients_with_ops=False,
+               name=None,
+               grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    This method combines the functionality of `minimize_unconstrained` and
+    `minimize_constrained`. If global_step < unconstrained_steps, it will
+    perform an unconstrained update, and if global_step >= unconstrained_steps,
+    it will perform a constrained update.
+
+    The reason for this functionality is that it may be best to initialize the
+    constrained optimizer with an approximate optimum of the unconstrained
+    problem.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      unconstrained_steps: optional int, number of steps for which we perform
+        unconstrained updates, before transitioning to constrained updates.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+
+    Raises:
+      ValueError: If unconstrained_steps is provided, but global_step is not.
+    """
+
+    def unconstrained_fn():
+      """Returns an `Op` for minimizing the unconstrained problem."""
+      return self.minimize_unconstrained(
+          minimization_problem=minimization_problem,
+          global_step=global_step,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          name=name,
+          grad_loss=grad_loss)
+
+    def constrained_fn():
+      """Returns an `Op` for minimizing the constrained problem."""
+      return self.minimize_constrained(
+          minimization_problem=minimization_problem,
+          global_step=global_step,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          name=name,
+          grad_loss=grad_loss)
+
+    if unconstrained_steps is not None:
+      if global_step is None:
+        raise ValueError(
+            "global_step cannot be None if unconstrained_steps is provided")
+      unconstrained_steps_tensor = ops.convert_to_tensor(unconstrained_steps)
+      dtype = unconstrained_steps_tensor.dtype
+      return control_flow_ops.cond(
+          standard_ops.cast(global_step, dtype) < unconstrained_steps_tensor,
+          true_fn=unconstrained_fn,
+          false_fn=constrained_fn)
+    else:
+      return constrained_fn()
diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
new file mode 100644
index 0000000000..01c6e4f08a
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
@@ -0,0 +1,375 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines `AdditiveExternalRegretOptimizer`.
+
+This optimizer minimizes a `ConstrainedMinimizationProblem` by introducing
+Lagrange multipliers, and using `tf.train.Optimizer`s to jointly optimize over
+the model parameters and Lagrange multipliers.
+
+For the purposes of constrained optimization, at least in theory,
+external-regret minimization suffices if the `ConstrainedMinimizationProblem`
+we're optimizing doesn't have any `proxy_constraints`, while swap-regret
+minimization should be used if `proxy_constraints` are present.
+
+For more specifics, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+The formulation used by the AdditiveExternalRegretOptimizer--which is simply the
+usual Lagrangian formulation--can be found in Definition 1, and is discussed in
+Section 3. This optimizer is most similar to Algorithm 3 in Appendix C.3, with
+the two differences being that it uses proxy constraints (if they're provided)
+in the update of the model parameters, and uses `tf.train.Optimizer`s, instead
+of SGD, for the "inner" updates.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.contrib.constrained_optimization.python import constrained_optimizer
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer as train_optimizer
+
+
+def _project_multipliers_wrt_euclidean_norm(multipliers, radius):
+  """Projects its argument onto the feasible region.
+
+  The feasible region is the set of all vectors with nonnegative elements that
+  sum to at most `radius`.
+
+  Args:
+    multipliers: 1d tensor, the Lagrange multipliers to project.
+    radius: float, the radius of the feasible region.
+
+  Returns:
+    The 1d tensor that results from projecting `multipliers` onto the feasible
+      region w.r.t. the Euclidean norm.
+
+  Raises:
+    ValueError: if the `multipliers` tensor does not have a fully-known shape
+      (including an unknown rank), or is not one-dimensional.
+  """
+  multipliers_shape = multipliers.get_shape()
+  if multipliers_shape is None or multipliers_shape.ndims is None:
+    raise ValueError("multipliers must have known shape")
+  if multipliers_shape.ndims != 1:
+    raise ValueError(
+        "multipliers must be one dimensional (instead is %d-dimensional)" %
+        multipliers_shape.ndims)
+  dimension = multipliers_shape[0].value
+  if dimension is None:
+    raise ValueError("multipliers must have fully-known shape")
+
+  def while_loop_condition(iteration, multipliers, inactive, old_inactive):
+    """Returns false if the while loop should terminate."""
+    del multipliers  # Needed by the body, but not the condition.
+    not_done = (iteration < dimension)
+    not_converged = standard_ops.reduce_any(
+        standard_ops.not_equal(inactive, old_inactive))
+    return standard_ops.logical_and(not_done, not_converged)
+
+  def while_loop_body(iteration, multipliers, inactive, old_inactive):
+    """Performs one iteration of the projection."""
+    del old_inactive  # Needed by the condition, but not the body.
+    iteration += 1
+    scale = standard_ops.minimum(
+        0.0,
+        (radius - standard_ops.reduce_sum(multipliers)) / standard_ops.maximum(
+            1.0, standard_ops.reduce_sum(inactive)))
+    multipliers += scale * inactive
+    new_inactive = standard_ops.to_float(multipliers > 0)
+    multipliers *= new_inactive
+    return (iteration, multipliers, new_inactive, inactive)
+
+  iteration = standard_ops.constant(0)
+  inactive = standard_ops.ones_like(multipliers)
+
+  # We actually want a do-while loop, so we explicitly call while_loop_body()
+  # once before tf.while_loop().
+  iteration, multipliers, inactive, old_inactive = while_loop_body(
+      iteration, multipliers, inactive, inactive)
+  iteration, multipliers, inactive, old_inactive = control_flow_ops.while_loop(
+      while_loop_condition,
+      while_loop_body,
+      loop_vars=(iteration, multipliers, inactive, old_inactive),
+      name="euclidean_projection")
+
+  return multipliers
+
+
+@six.add_metaclass(abc.ABCMeta)
+class _ExternalRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
+  """Base class representing an `_ExternalRegretOptimizer`.
+
+  This class contains most of the logic for performing constrained
+  optimization, minimizing external regret for the constraints player. What it
+  *doesn't* do is keep track of the internal state (the Lagrange multipliers).
+  Instead, the state is accessed via the _initial_state(),
+  _lagrange_multipliers(), _constraint_grad_and_var() and _projection_op()
+  methods.
+
+  The reason for this is that we want to make it easy to implement different
+  representations of the internal state.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by `_ExternalRegretOptimizer`s--which is simply the usual
+  Lagrangian formulation--can be found in Definition 1, and is discussed in
+  Section 3. Such optimizers are most similar to Algorithm 3 in Appendix C.3.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Constructs a new `_ExternalRegretOptimizer`.
+
+    The difference between `optimizer` and `constraint_optimizer` (if the latter
+    is provided) is that the former is used for learning the model parameters,
+    while the latter is used for the Lagrange multipliers. If no
+    `constraint_optimizer` is provided, then `optimizer` is used for both.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of the ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multipliers.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multipliers.
+
+    Returns:
+      A new `_ExternalRegretOptimizer`.
+    """
+    super(_ExternalRegretOptimizer, self).__init__(optimizer=optimizer)
+    self._constraint_optimizer = constraint_optimizer
+
+  @property
+  def constraint_optimizer(self):
+    """Returns the `tf.train.Optimizer` used for the Lagrange multipliers."""
+    return self._constraint_optimizer
+
+  @abc.abstractmethod
+  def _initial_state(self, num_constraints):
+    pass
+
+  @abc.abstractmethod
+  def _lagrange_multipliers(self, state):
+    pass
+
+  @abc.abstractmethod
+  def _constraint_grad_and_var(self, state, gradient):
+    pass
+
+  @abc.abstractmethod
+  def _projection_op(self, state, name=None):
+    pass
+
+  def minimize_constrained(self,
+                           minimization_problem,
+                           global_step=None,
+                           var_list=None,
+                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                           aggregation_method=None,
+                           colocate_gradients_with_ops=False,
+                           name=None,
+                           grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    The `optimizer` constructor parameter will be used to update the model
+    parameters, while the Lagrange multipliers will be updated using
+    `constraint_optimizer` (if provided) or `optimizer` (if not).
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    objective = minimization_problem.objective
+
+    constraints = minimization_problem.constraints
+    proxy_constraints = minimization_problem.proxy_constraints
+    if proxy_constraints is None:
+      proxy_constraints = constraints
+    # Flatten both constraints tensors to 1d.
+    num_constraints = minimization_problem.num_constraints
+    constraints = standard_ops.reshape(constraints, shape=(num_constraints,))
+    proxy_constraints = standard_ops.reshape(
+        proxy_constraints, shape=(num_constraints,))
+
+    # We use a lambda to initialize the state so that, if this function call is
+    # inside the scope of a tf.control_dependencies() block, the dependencies
+    # will not be applied to the initializer.
+    state = standard_ops.Variable(
+        lambda: self._initial_state(num_constraints),
+        trainable=False,
+        name="external_regret_optimizer_state")
+
+    multipliers = self._lagrange_multipliers(state)
+    loss = (
+        objective + standard_ops.tensordot(multipliers, proxy_constraints, 1))
+    multipliers_gradient = constraints
+
+    update_ops = []
+    if self.constraint_optimizer is None:
+      # If we don't have a separate constraint_optimizer, then we use
+      # self._optimizer for both the update of the model parameters, and that of
+      # the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      grads_and_vars.append(
+          self._constraint_grad_and_var(state, multipliers_gradient))
+      update_ops.append(
+          self.optimizer.apply_gradients(grads_and_vars, name="update"))
+    else:
+      # If we have a separate constraint_optimizer, then we use self._optimizer
+      # for the update of the model parameters, and self._constraint_optimizer
+      # for that of the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      multiplier_grads_and_vars = [
+          self._constraint_grad_and_var(state, multipliers_gradient)
+      ]
+
+      gradients = [
+          gradient for gradient, _ in grads_and_vars + multiplier_grads_and_vars
+          if gradient is not None
+      ]
+      with ops.control_dependencies(gradients):
+        update_ops.append(
+            self.optimizer.apply_gradients(grads_and_vars, name="update"))
+        update_ops.append(
+            self.constraint_optimizer.apply_gradients(
+                multiplier_grads_and_vars, name="optimizer_state_update"))
+
+    with ops.control_dependencies(update_ops):
+      if global_step is None:
+        # If we don't have a global step, just project, and we're done.
+        return self._projection_op(state, name=name)
+      else:
+        # If we have a global step, then we need to increment it in addition to
+        # projecting.
+        projection_op = self._projection_op(state, name="project")
+        with ops.colocate_with(global_step):
+          global_step_op = state_ops.assign_add(
+              global_step, 1, name="global_step_increment")
+        return control_flow_ops.group(projection_op, global_step_op, name=name)
+
+
+class AdditiveExternalRegretOptimizer(_ExternalRegretOptimizer):
+  """A `ConstrainedOptimizer` based on external-regret minimization.
+
+  This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly
+  minimize over the model parameters, and maximize over Lagrange multipliers,
+  with the latter maximization using additive updates and an algorithm that
+  minimizes external regret.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by this optimizer--which is simply the usual Lagrangian
+  formulation--can be found in Definition 1, and is discussed in Section 3. It
+  is most similar to Algorithm 3 in Appendix C.3, with the two differences being
+  that it uses proxy constraints (if they're provided) in the update of the
+  model parameters, and uses `tf.train.Optimizer`s, instead of SGD, for the
+  "inner" updates.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               maximum_multiplier_radius=None):
+    """Constructs a new `AdditiveExternalRegretOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multipliers.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multipliers.
+      maximum_multiplier_radius: optional float, a strictly positive upper bound
+        to impose on the sum of the Lagrange multipliers.
+
+    Returns:
+      A new `AdditiveExternalRegretOptimizer`.
+
+    Raises:
+      ValueError: If the maximum_multiplier_radius parameter is nonpositive.
+    """
+    super(AdditiveExternalRegretOptimizer, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+
+    if (maximum_multiplier_radius is not None and
+        maximum_multiplier_radius <= 0.0):
+      raise ValueError("maximum_multiplier_radius must be strictly positive")
+    self._maximum_multiplier_radius = maximum_multiplier_radius
+
+  def _initial_state(self, num_constraints):
+    # For an AdditiveExternalRegretOptimizer, the internal state is simply a
+    # tensor of Lagrange multipliers with shape (m,), where m is the number of
+    # constraints.
+    return standard_ops.zeros((num_constraints,), dtype=dtypes.float32)
+
+  def _lagrange_multipliers(self, state):
+    return state
+
+  def _constraint_grad_and_var(self, state, gradient):
+    # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True?
+    return (-gradient, state)
+
+  def _projection_op(self, state, name=None):
+    with ops.colocate_with(state):
+      if self._maximum_multiplier_radius is not None:
+        projected_multipliers = _project_multipliers_wrt_euclidean_norm(
+            state, self._maximum_multiplier_radius)
+      else:
+        projected_multipliers = standard_ops.maximum(state, 0.0)
+      return state_ops.assign(state, projected_multipliers, name=name)
diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py
new file mode 100644
index 0000000000..9b4bf62710
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py
@@ -0,0 +1,136 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for constrained_optimization.python.external_regret_optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.constrained_optimization.python import external_regret_optimizer
+from tensorflow.contrib.constrained_optimization.python import test_util
+
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+class AdditiveExternalRegretOptimizerWrapper(
+    external_regret_optimizer.AdditiveExternalRegretOptimizer):
+  """Testing wrapper class around AdditiveExternalRegretOptimizer.
+
+  This class is identical to AdditiveExternalRegretOptimizer, except that it
+  caches the internal optimization state when _lagrange_multipliers() is called,
+  so that we can test that the Lagrange multipliers take on their expected
+  values.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               maximum_multiplier_radius=None):
+    """Same as AdditiveExternalRegretOptimizer.__init__."""
+    super(AdditiveExternalRegretOptimizerWrapper, self).__init__(
+        optimizer=optimizer,
+        constraint_optimizer=constraint_optimizer,
+        maximum_multiplier_radius=maximum_multiplier_radius)
+    self._cached_lagrange_multipliers = None
+
+  @property
+  def lagrange_multipliers(self):
+    """Returns the cached Lagrange multipliers."""
+    return self._cached_lagrange_multipliers
+
+  def _lagrange_multipliers(self, state):
+    """Caches the internal state for testing."""
+    self._cached_lagrange_multipliers = super(
+        AdditiveExternalRegretOptimizerWrapper,
+        self)._lagrange_multipliers(state)
+    return self._cached_lagrange_multipliers
+
+
+class ExternalRegretOptimizerTest(test.TestCase):
+
+  def test_project_multipliers_wrt_euclidean_norm(self):
+    """Tests Euclidean projection routine on some known values."""
+    multipliers1 = standard_ops.constant([-0.1, -0.6, -0.3])
+    expected_projected_multipliers1 = np.array([0.0, 0.0, 0.0])
+
+    multipliers2 = standard_ops.constant([-0.1, 0.6, 0.3])
+    expected_projected_multipliers2 = np.array([0.0, 0.6, 0.3])
+
+    multipliers3 = standard_ops.constant([0.4, 0.7, -0.2, 0.5, 0.1])
+    expected_projected_multipliers3 = np.array([0.2, 0.5, 0.0, 0.3, 0.0])
+
+    with self.test_session() as session:
+      projected_multipliers1 = session.run(
+          external_regret_optimizer._project_multipliers_wrt_euclidean_norm(
+              multipliers1, 1.0))
+      projected_multipliers2 = session.run(
+          external_regret_optimizer._project_multipliers_wrt_euclidean_norm(
+              multipliers2, 1.0))
+      projected_multipliers3 = session.run(
+          external_regret_optimizer._project_multipliers_wrt_euclidean_norm(
+              multipliers3, 1.0))
+
+    self.assertAllClose(
+        expected_projected_multipliers1,
+        projected_multipliers1,
+        rtol=0,
+        atol=1e-6)
+    self.assertAllClose(
+        expected_projected_multipliers2,
+        projected_multipliers2,
+        rtol=0,
+        atol=1e-6)
+    self.assertAllClose(
+        expected_projected_multipliers3,
+        projected_multipliers3,
+        rtol=0,
+        atol=1e-6)
+
+  def test_additive_external_regret_optimizer(self):
+    """Tests that the Lagrange multipliers update as expected."""
+    minimization_problem = test_util.ConstantMinimizationProblem(
+        np.array([0.6, -0.1, 0.4]))
+    optimizer = AdditiveExternalRegretOptimizerWrapper(
+        gradient_descent.GradientDescentOptimizer(1.0),
+        maximum_multiplier_radius=1.0)
+    train_op = optimizer.minimize_constrained(minimization_problem)
+
+    expected_multipliers = [
+        np.array([0.0, 0.0, 0.0]),
+        np.array([0.6, 0.0, 0.4]),
+        np.array([0.7, 0.0, 0.3]),
+        np.array([0.8, 0.0, 0.2]),
+        np.array([0.9, 0.0, 0.1]),
+        np.array([1.0, 0.0, 0.0]),
+        np.array([1.0, 0.0, 0.0]),
+    ]
+
+    multipliers = []
+    with self.test_session() as session:
+      session.run(standard_ops.global_variables_initializer())
+      while len(multipliers) < len(expected_multipliers):
+        multipliers.append(session.run(optimizer.lagrange_multipliers))
+        session.run(train_op)
+
+    for expected, actual in zip(expected_multipliers, multipliers):
+      self.assertAllClose(expected, actual, rtol=0, atol=1e-6)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
new file mode 100644
index 0000000000..04014ab4ae
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
@@ -0,0 +1,595 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines `{Additive,Multiplicative}SwapRegretOptimizer`s.
+
+These optimizers minimize a `ConstrainedMinimizationProblem` by using a
+swap-regret minimizing algorithm (either SGD or multiplicative weights) to learn
+what weights should be associated with the objective function and constraints.
+These algorithms do *not* use Lagrange multipliers, but the idea is similar.
+The main differences between the formulation used here, and the standard
+Lagrangian formulation, are that (i) the objective function is weighted, in
+addition to the constraints, and (ii) we learn a matrix of weights, instead of a
+vector.
+
+For the purposes of constrained optimization, at least in theory,
+external-regret minimization suffices if the `ConstrainedMinimizationProblem`
+we're optimizing doesn't have any `proxy_constraints`, while swap-regret
+minimization should be used if `proxy_constraints` are present.
+
+For more specifics, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+The formulation used by both of the SwapRegretOptimizers can be found in
+Definition 2, and is discussed in Section 4. The
+`MultiplicativeSwapRegretOptimizer` is most similar to Algorithm 2 in Section 4,
+with the difference being that it uses `tf.train.Optimizer`s, instead of SGD,
+for the "inner" updates. The `AdditiveSwapRegretOptimizer` differs further in
+that it performs additive (instead of multiplicative) updates of the stochastic
+matrix.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import math
+
+import six
+
+from tensorflow.contrib.constrained_optimization.python import constrained_optimizer
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer as train_optimizer
+
+
+def _maximal_eigenvector_power_method(matrix,
+                                      epsilon=1e-6,
+                                      maximum_iterations=100):
+  """Returns the maximal right-eigenvector of `matrix` using the power method.
+
+  Args:
+    matrix: 2D Tensor, the matrix of which we will find the maximal
+      right-eigenvector.
+    epsilon: positive float, if two iterations of the power method differ (in
+      L2 norm) by no more than epsilon, we will terminate.
+    maximum_iterations: positive int, if we perform this many iterations, we
+      will terminate.
+
+  Returns:
+    The maximal right-eigenvector of `matrix`, normalized to unit L2 norm.
+
+  Raises:
+    ValueError: If the epsilon or maximum_iterations parameters violate their
+      bounds.
+  """
+  if epsilon <= 0.0:
+    raise ValueError("epsilon must be strictly positive")
+  if maximum_iterations <= 0:
+    raise ValueError("maximum_iterations must be strictly positive")
+
+  def while_loop_condition(iteration, eigenvector, old_eigenvector):
+    """Returns false if the while loop should terminate."""
+    not_done = (iteration < maximum_iterations)
+    not_converged = (standard_ops.norm(eigenvector - old_eigenvector) > epsilon)
+    return standard_ops.logical_and(not_done, not_converged)
+
+  def while_loop_body(iteration, eigenvector, old_eigenvector):
+    """Performs one iteration of the power method."""
+    del old_eigenvector  # Needed by the condition, but not the body.
+    iteration += 1
+    # We need to use tf.matmul() and tf.expand_dims(), instead of
+    # tf.tensordot(), since the former will infer the shape of the result, while
+    # the latter will not (tf.while_loop() needs the shapes).
+    new_eigenvector = standard_ops.matmul(
+        matrix, standard_ops.expand_dims(eigenvector, 1))[:, 0]
+    new_eigenvector /= standard_ops.norm(new_eigenvector)
+    return (iteration, new_eigenvector, eigenvector)
+
+  iteration = standard_ops.constant(0)
+  eigenvector = standard_ops.ones_like(matrix[:, 0])
+  eigenvector /= standard_ops.norm(eigenvector)
+
+  # We actually want a do-while loop, so we explicitly call while_loop_body()
+  # once before tf.while_loop().
+  iteration, eigenvector, old_eigenvector = while_loop_body(
+      iteration, eigenvector, eigenvector)
+  iteration, eigenvector, old_eigenvector = control_flow_ops.while_loop(
+      while_loop_condition,
+      while_loop_body,
+      loop_vars=(iteration, eigenvector, old_eigenvector),
+      name="power_method")
+
+  return eigenvector
+
+
+def _project_stochastic_matrix_wrt_euclidean_norm(matrix):
+  """Projects its argument onto the set of left-stochastic matrices.
+
+  This algorithm is O(n^3) at worst, where `matrix` is n*n. It can be done in
+  O(n^2 * log(n)) time by sorting each column (and maybe better with a different
+  algorithm), but the algorithm implemented here is easier to implement in
+  TensorFlow.
+
+  Args:
+    matrix: 2d square tensor, the matrix to project.
+
+  Returns:
+    The 2d square tensor that results from projecting `matrix` onto the set of
+      left-stochastic matrices w.r.t. the Euclidean norm applied column-wise
+      (i.e. the Frobenius norm).
+
+  Raises:
+    ValueError: if the `matrix` tensor does not have a fully-known shape, or is
+      not two-dimensional and square.
+  """
+  matrix_shape = matrix.get_shape()
+  if matrix_shape.ndims is None:  # Unknown rank is reported via ndims.
+    raise ValueError("matrix must have known shape")
+  if matrix_shape.ndims != 2:
+    raise ValueError(
+        "matrix must be two dimensional (instead is %d-dimensional)" %
+        matrix_shape.ndims)
+  if matrix_shape[0] != matrix_shape[1]:
+    raise ValueError("matrix must be square (instead has shape (%d,%d))" %
+                     (matrix_shape[0], matrix_shape[1]))
+  dimension = matrix_shape[0].value
+  if dimension is None:
+    raise ValueError("matrix must have fully-known shape")
+
+  def while_loop_condition(iteration, matrix, inactive, old_inactive):
+    """Returns false if the while loop should terminate."""
+    del matrix  # Needed by the body, but not the condition.
+    not_done = (iteration < dimension)
+    not_converged = standard_ops.reduce_any(
+        standard_ops.not_equal(inactive, old_inactive))
+    return standard_ops.logical_and(not_done, not_converged)
+
+  def while_loop_body(iteration, matrix, inactive, old_inactive):
+    """Performs one iteration of the projection."""
+    del old_inactive  # Needed by the condition, but not the body.
+    iteration += 1
+    scale = (1.0 - standard_ops.reduce_sum(
+        matrix, axis=0, keep_dims=True)) / standard_ops.maximum(
+            1.0, standard_ops.reduce_sum(inactive, axis=0, keep_dims=True))
+    matrix += scale * inactive
+    new_inactive = standard_ops.to_float(matrix > 0)
+    matrix *= new_inactive
+    return (iteration, matrix, new_inactive, inactive)
+
+  iteration = standard_ops.constant(0)
+  inactive = standard_ops.ones_like(matrix)
+
+  # We actually want a do-while loop, so we explicitly call while_loop_body()
+  # once before tf.while_loop().
+  iteration, matrix, inactive, old_inactive = while_loop_body(
+      iteration, matrix, inactive, inactive)
+  iteration, matrix, inactive, old_inactive = control_flow_ops.while_loop(
+      while_loop_condition,
+      while_loop_body,
+      loop_vars=(iteration, matrix, inactive, old_inactive),
+      name="euclidean_projection")
+
+  return matrix
+
+
+def _project_log_stochastic_matrix_wrt_kl_divergence(log_matrix):
+  """Projects its argument onto the set of log-left-stochastic matrices.
+
+  Args:
+    log_matrix: 2d square tensor, the element-wise logarithm of the matrix to
+      project.
+
+  Returns:
+    The element-wise logarithm of the projection of exp(`log_matrix`) onto the
+      set of left-stochastic matrices w.r.t. the column-wise KL-divergence.
+  """
+
+  # For numerical reasons, shift each column so that its largest element is
+  # zero before exponentiating.
+  log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keep_dims=True)
+  log_matrix -= standard_ops.log(
+      standard_ops.reduce_sum(
+          standard_ops.exp(log_matrix), axis=0, keep_dims=True))
+  return log_matrix
+
+
+@six.add_metaclass(abc.ABCMeta)
+class _SwapRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
+  """Base class representing a `_SwapRegretOptimizer`.
+
+  This class contains most of the logic for performing constrained optimization,
+  minimizing swap regret for the constraints player. What it *doesn't* do is
+  keep track of the internal state (the stochastic matrix).  Instead, the state
+  is accessed via the _initial_state(), _stochastic_matrix(),
+  _constraint_grad_and_var() and _projection_op() methods.
+
+  The reason for this is that we want to make it easy to implement different
+  representations of the internal state. For example, for additive updates, it's
+  most natural to store the stochastic matrix directly, whereas for
+  multiplicative updates, it's most natural to store its element-wise logarithm.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by `_SwapRegretOptimizer`s can be found in Definition 2,
+  and is discussed in Section 4. Such optimizers are most similar to Algorithm
+  2 in Section 4. Most notably, the internal state is a left-stochastic matrix
+  of shape (m+1,m+1), where m is the number of constraints.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Constructs a new `_SwapRegretOptimizer`.
+
+    The difference between `optimizer` and `constraint_optimizer` (if the latter
+    is provided) is that the former is used for learning the model parameters,
+    while the latter is used for the update to the constraint/objective weight
+    matrix (the analogue of Lagrange multipliers). If no `constraint_optimizer`
+    is provided, then `optimizer` is used for both.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multiplier analogues.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multiplier analogues.
+
+    Returns:
+      A new `_SwapRegretOptimizer`.
+    """
+    super(_SwapRegretOptimizer, self).__init__(optimizer=optimizer)
+    self._constraint_optimizer = constraint_optimizer
+
+  @property
+  def constraint_optimizer(self):
+    """Returns the `tf.train.Optimizer` used for the matrix."""
+    return self._constraint_optimizer
+
+  @abc.abstractmethod
+  def _initial_state(self, num_constraints):
+    pass
+
+  @abc.abstractmethod
+  def _stochastic_matrix(self, state):
+    pass
+
+  def _distribution(self, state):
+    distribution = _maximal_eigenvector_power_method(
+        self._stochastic_matrix(state))
+    distribution = standard_ops.abs(distribution)
+    distribution /= standard_ops.reduce_sum(distribution)
+    return distribution
+
+  @abc.abstractmethod
+  def _constraint_grad_and_var(self, state, gradient):
+    pass
+
+  @abc.abstractmethod
+  def _projection_op(self, state, name=None):
+    pass
+
+  def minimize_constrained(self,
+                           minimization_problem,
+                           global_step=None,
+                           var_list=None,
+                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                           aggregation_method=None,
+                           colocate_gradients_with_ops=False,
+                           name=None,
+                           grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    The `optimizer` constructor parameter will be used to update the model
+    parameters, while the constraint/objective weight matrix (the analogue of
+    Lagrange multipliers) will be updated using `constraint_optimizer` (if
+    provided) or `optimizer` (if not). Whether the matrix updates are additive
+    or multiplicative depends on the derived class.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    objective = minimization_problem.objective
+
+    constraints = minimization_problem.constraints
+    proxy_constraints = minimization_problem.proxy_constraints
+    if proxy_constraints is None:
+      proxy_constraints = constraints
+    # Flatten both constraints tensors to 1d.
+    num_constraints = minimization_problem.num_constraints
+    constraints = standard_ops.reshape(constraints, shape=(num_constraints,))
+    proxy_constraints = standard_ops.reshape(
+        proxy_constraints, shape=(num_constraints,))
+
+    # We use a lambda to initialize the state so that, if this function call is
+    # inside the scope of a tf.control_dependencies() block, the dependencies
+    # will not be applied to the initializer.
+    state = standard_ops.Variable(
+        lambda: self._initial_state(num_constraints),
+        trainable=False,
+        name="swap_regret_optimizer_state")
+
+    zero_and_constraints = standard_ops.concat(
+        (standard_ops.zeros((1,)), constraints), axis=0)
+    objective_and_proxy_constraints = standard_ops.concat(
+        (standard_ops.expand_dims(objective, 0), proxy_constraints), axis=0)
+
+    distribution = self._distribution(state)
+    loss = standard_ops.tensordot(distribution, objective_and_proxy_constraints,
+                                  1)
+    matrix_gradient = standard_ops.matmul(
+        standard_ops.expand_dims(zero_and_constraints, 1),
+        standard_ops.expand_dims(distribution, 0))
+
+    update_ops = []
+    if self.constraint_optimizer is None:
+      # If we don't have a separate constraint_optimizer, then we use
+      # self._optimizer for both the update of the model parameters, and that of
+      # the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      grads_and_vars.append(
+          self._constraint_grad_and_var(state, matrix_gradient))
+      update_ops.append(
+          self.optimizer.apply_gradients(grads_and_vars, name="update"))
+    else:
+      # If we have a separate constraint_optimizer, then we use self._optimizer
+      # for the update of the model parameters, and self._constraint_optimizer
+      # for that of the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      matrix_grads_and_vars = [
+          self._constraint_grad_and_var(state, matrix_gradient)
+      ]
+
+      gradients = [
+          gradient for gradient, _ in grads_and_vars + matrix_grads_and_vars
+          if gradient is not None
+      ]
+      with ops.control_dependencies(gradients):
+        update_ops.append(
+            self.optimizer.apply_gradients(grads_and_vars, name="update"))
+        update_ops.append(
+            self.constraint_optimizer.apply_gradients(
+                matrix_grads_and_vars, name="optimizer_state_update"))
+
+    with ops.control_dependencies(update_ops):
+      if global_step is None:
+        # If we don't have a global step, just project, and we're done.
+        return self._projection_op(state, name=name)
+      else:
+        # If we have a global step, then we need to increment it in addition to
+        # projecting.
+        projection_op = self._projection_op(state, name="project")
+        with ops.colocate_with(global_step):
+          global_step_op = state_ops.assign_add(
+              global_step, 1, name="global_step_increment")
+        return control_flow_ops.group(projection_op, global_step_op, name=name)
+
+
+class AdditiveSwapRegretOptimizer(_SwapRegretOptimizer):
+  """A `ConstrainedOptimizer` based on swap-regret minimization.
+
+  This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly
+  minimize over the model parameters, and maximize over constraint/objective
+  weight matrix (the analogue of Lagrange multipliers), with the latter
+  maximization using additive updates and an algorithm that minimizes swap
+  regret.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by this optimizer can be found in Definition 2, and is
+  discussed in Section 4. It is most similar to Algorithm 2 in Section 4, with
+  the differences being that it uses `tf.train.Optimizer`s, instead of SGD, for
+  the "inner" updates, and performs additive (instead of multiplicative) updates
+  of the stochastic matrix.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Constructs a new `AdditiveSwapRegretOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multiplier analogues.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multiplier analogues.
+
+    Returns:
+      A new `AdditiveSwapRegretOptimizer`.
+    """
+    # TODO(acotter): add a parameter determining the initial values of the
+    # matrix elements (like initial_multiplier_radius in
+    # MultiplicativeSwapRegretOptimizer).
+    super(AdditiveSwapRegretOptimizer, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+
+  def _initial_state(self, num_constraints):
+    # For an AdditiveSwapRegretOptimizer, the internal state is a tensor of
+    # shape (m+1,m+1), where m is the number of constraints, representing a
+    # left-stochastic matrix.
+    dimension = num_constraints + 1
+    # Initialize by putting all weight on the objective, and none on the
+    # constraints.
+    return standard_ops.concat(
+        (standard_ops.ones(
+            (1, dimension)), standard_ops.zeros((dimension - 1, dimension))),
+        axis=0)
+
+  def _stochastic_matrix(self, state):
+    return state
+
+  def _constraint_grad_and_var(self, state, gradient):
+    # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True?
+    return (-gradient, state)
+
+  def _projection_op(self, state, name=None):
+    with ops.colocate_with(state):
+      return state_ops.assign(
+          state,
+          _project_stochastic_matrix_wrt_euclidean_norm(state),
+          name=name)
+
+
+class MultiplicativeSwapRegretOptimizer(_SwapRegretOptimizer):
+  """A `ConstrainedOptimizer` based on swap-regret minimization.
+
+  This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly
+  minimize over the model parameters, and maximize over constraint/objective
+  weight matrix (the analogue of Lagrange multipliers), with the latter
+  maximization using multiplicative updates and an algorithm that minimizes swap
+  regret.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by this optimizer can be found in Definition 2, and is
+  discussed in Section 4. It is most similar to Algorithm 2 in Section 4, with
+  the difference being that it uses `tf.train.Optimizer`s, instead of SGD, for
+  the "inner" updates.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               minimum_multiplier_radius=1e-3,
+               initial_multiplier_radius=None):
+    """Constructs a new `MultiplicativeSwapRegretOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multiplier analogues.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multiplier analogues.
+      minimum_multiplier_radius: float, each element of the matrix will be lower
+        bounded by `minimum_multiplier_radius` divided by one plus the number of
+        constraints.
+      initial_multiplier_radius: float, the initial value of each element of the
+        matrix associated with a constraint (i.e. excluding those elements
+        associated with the objective) will be `initial_multiplier_radius`
+        divided by one plus the number of constraints. Defaults to the value of
+        `minimum_multiplier_radius`.
+
+    Returns:
+      A new `MultiplicativeSwapRegretOptimizer`.
+
+    Raises:
+      ValueError: If the two radius parameters are inconsistent.
+    """
+    super(MultiplicativeSwapRegretOptimizer, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+
+    if (minimum_multiplier_radius <= 0.0) or (minimum_multiplier_radius >= 1.0):
+      raise ValueError("minimum_multiplier_radius must be in the range (0,1)")
+    if initial_multiplier_radius is None:
+      initial_multiplier_radius = minimum_multiplier_radius
+    elif (initial_multiplier_radius <
+          minimum_multiplier_radius) or (initial_multiplier_radius > 1.0):
+      raise ValueError("initial_multiplier_radius must be in the range "
+                       "[minimum_multiplier_radius,1]")
+
+    self._minimum_multiplier_radius = minimum_multiplier_radius
+    self._initial_multiplier_radius = initial_multiplier_radius
+
+  def _initial_state(self, num_constraints):
+    # For a MultiplicativeSwapRegretOptimizer, the internal state is a tensor of
+    # shape (m+1,m+1), where m is the number of constraints, representing the
+    # element-wise logarithm of a left-stochastic matrix.
+    dimension = num_constraints + 1
+    # Initialize by putting as much weight as possible on the objective, and as
+    # little as possible on the constraints.
+    log_initial_one = math.log(1.0 - (self._initial_multiplier_radius *
+                                      (dimension - 1) / (dimension)))
+    log_initial_zero = math.log(self._initial_multiplier_radius / dimension)
+    return standard_ops.concat(
+        (standard_ops.constant(
+            log_initial_one, dtype=dtypes.float32, shape=(1, dimension)),
+         standard_ops.constant(
+             log_initial_zero,
+             dtype=dtypes.float32,
+             shape=(dimension - 1, dimension))),
+        axis=0)
+
+  def _stochastic_matrix(self, state):
+    return standard_ops.exp(state)
+
+  def _constraint_grad_and_var(self, state, gradient):
+    # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True?
+    return (-gradient, state)
+
+  def _projection_op(self, state, name=None):
+    with ops.colocate_with(state):
+      # Gets the dimension of the state (num_constraints + 1)--all of these
+      # assertions are of things that should be impossible, since the state
+      # passed into this method will have the same shape as that returned by
+      # _initial_state().
+      state_shape = state.get_shape()
+      assert state_shape is not None
+      assert state_shape.ndims == 2
+      assert state_shape[0] == state_shape[1]
+      dimension = state_shape[0].value
+      assert dimension is not None
+
+      minimum_log_multiplier = standard_ops.log(
+          self._minimum_multiplier_radius / standard_ops.to_float(dimension))
+
+      return state_ops.assign(
+          state,
+          standard_ops.maximum(
+              _project_log_stochastic_matrix_wrt_kl_divergence(state),
+              minimum_log_multiplier),
+          name=name)
diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py
new file mode 100644
index 0000000000..34c4543dca
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py
@@ -0,0 +1,212 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for constrained_optimization.python.swap_regret_optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.constrained_optimization.python import swap_regret_optimizer
+from tensorflow.contrib.constrained_optimization.python import test_util
+
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+class AdditiveSwapRegretOptimizerWrapper(
+    swap_regret_optimizer.AdditiveSwapRegretOptimizer):
+  """Testing wrapper class around AdditiveSwapRegretOptimizer.
+
+  This class is identical to AdditiveSwapRegretOptimizer, except that it caches
+  the internal optimization state when _stochastic_matrix() is called, so that
+  we can test that the stochastic matrices take on their expected values.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Same as AdditiveSwapRegretOptimizer.__init__()."""
+    super(AdditiveSwapRegretOptimizerWrapper, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+    self._cached_stochastic_matrix = None
+
+  @property
+  def stochastic_matrix(self):
+    """Returns the cached stochastic matrix."""
+    return self._cached_stochastic_matrix
+
+  def _stochastic_matrix(self, state):
+    """Caches the internal state for testing."""
+    self._cached_stochastic_matrix = super(AdditiveSwapRegretOptimizerWrapper,
+                                           self)._stochastic_matrix(state)
+    return self._cached_stochastic_matrix
+
+
+class MultiplicativeSwapRegretOptimizerWrapper(
+    swap_regret_optimizer.MultiplicativeSwapRegretOptimizer):
+  """Testing wrapper class around MultiplicativeSwapRegretOptimizer.
+
+  This class is identical to MultiplicativeSwapRegretOptimizer, except that it
+  caches the internal optimization state when _stochastic_matrix() is called, so
+  that we can test that the stochastic matrices take on their expected values.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               minimum_multiplier_radius=None,
+               initial_multiplier_radius=None):
+    """Same as MultiplicativeSwapRegretOptimizer.__init__()."""
+    super(MultiplicativeSwapRegretOptimizerWrapper, self).__init__(
+        optimizer=optimizer,
+        constraint_optimizer=constraint_optimizer,
+        minimum_multiplier_radius=1e-3,
+        initial_multiplier_radius=initial_multiplier_radius)
+    self._cached_stochastic_matrix = None
+
+  @property
+  def stochastic_matrix(self):
+    """Returns the cached stochastic matrix."""
+    return self._cached_stochastic_matrix
+
+  def _stochastic_matrix(self, state):
+    """Caches the internal state for testing."""
+    self._cached_stochastic_matrix = super(
+        MultiplicativeSwapRegretOptimizerWrapper,
+        self)._stochastic_matrix(state)
+    return self._cached_stochastic_matrix
+
+
+class SwapRegretOptimizerTest(test.TestCase):
+
+  def test_maximum_eigenvector_power_method(self):
+    """Tests power method routine on some known left-stochastic matrices."""
+    matrix1 = np.matrix([[0.6, 0.1, 0.1], [0.0, 0.6, 0.9], [0.4, 0.3, 0.0]])
+    matrix2 = np.matrix([[0.4, 0.4, 0.2], [0.2, 0.1, 0.5], [0.4, 0.5, 0.3]])
+
+    with self.test_session() as session:
+      eigenvector1 = session.run(
+          swap_regret_optimizer._maximal_eigenvector_power_method(
+              standard_ops.constant(matrix1)))
+      eigenvector2 = session.run(
+          swap_regret_optimizer._maximal_eigenvector_power_method(
+              standard_ops.constant(matrix2)))
+
+    # Check that eigenvector1 and eigenvector2 are eigenvectors of matrix1 and
+    # matrix2 (respectively) with associated eigenvalue 1.
+    matrix_eigenvector1 = np.tensordot(matrix1, eigenvector1, axes=1)
+    matrix_eigenvector2 = np.tensordot(matrix2, eigenvector2, axes=1)
+    self.assertAllClose(eigenvector1, matrix_eigenvector1, rtol=0, atol=1e-6)
+    self.assertAllClose(eigenvector2, matrix_eigenvector2, rtol=0, atol=1e-6)
+
+  def test_project_stochastic_matrix_wrt_euclidean_norm(self):
+    """Tests Euclidean projection routine on some known values."""
+    matrix = standard_ops.constant([[-0.1, -0.1, 0.4], [-0.8, 0.4, 1.2],
+                                    [-0.3, 0.1, 0.2]])
+    expected_projected_matrix = np.array([[0.6, 0.1, 0.1], [0.0, 0.6, 0.9],
+                                          [0.4, 0.3, 0.0]])
+
+    with self.test_session() as session:
+      projected_matrix = session.run(
+          swap_regret_optimizer._project_stochastic_matrix_wrt_euclidean_norm(
+              matrix))
+
+    self.assertAllClose(
+        expected_projected_matrix, projected_matrix, rtol=0, atol=1e-6)
+
+  def test_project_log_stochastic_matrix_wrt_kl_divergence(self):
+    """Tests KL-divergence projection routine on some known values."""
+    matrix = standard_ops.constant([[0.2, 0.8, 0.6], [0.1, 0.2, 1.5],
+                                    [0.2, 1.0, 0.9]])
+    expected_projected_matrix = np.array([[0.4, 0.4, 0.2], [0.2, 0.1, 0.5],
+                                          [0.4, 0.5, 0.3]])
+
+    with self.test_session() as session:
+      projected_matrix = session.run(
+          standard_ops.exp(
+              swap_regret_optimizer.
+              _project_log_stochastic_matrix_wrt_kl_divergence(
+                  standard_ops.log(matrix))))
+
+    self.assertAllClose(
+        expected_projected_matrix, projected_matrix, rtol=0, atol=1e-6)
+
+  def test_additive_swap_regret_optimizer(self):
+    """Tests that the stochastic matrices update as expected."""
+    minimization_problem = test_util.ConstantMinimizationProblem(
+        np.array([0.6, -0.1, 0.4]))
+    optimizer = AdditiveSwapRegretOptimizerWrapper(
+        gradient_descent.GradientDescentOptimizer(1.0))
+    train_op = optimizer.minimize_constrained(minimization_problem)
+
+    # Calculated using a numpy+python implementation of the algorithm.
+    expected_matrices = [
+        np.array([[1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 0.0],
+                  [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]),
+        np.array([[0.66666667, 1.0, 1.0, 1.0], [0.26666667, 0.0, 0.0, 0.0],
+                  [0.0, 0.0, 0.0, 0.0], [0.06666667, 0.0, 0.0, 0.0]]),
+        np.array([[0.41666667, 0.93333333, 1.0,
+                   0.98333333], [0.46666667, 0.05333333, 0.0,
+                                 0.01333333], [0.0, 0.0, 0.0, 0.0],
+                  [0.11666667, 0.01333333, 0.0, 0.00333333]]),
+    ]
+
+    matrices = []
+    with self.test_session() as session:
+      session.run(standard_ops.global_variables_initializer())
+      while len(matrices) < len(expected_matrices):
+        matrices.append(session.run(optimizer.stochastic_matrix))
+        session.run(train_op)
+
+    for expected, actual in zip(expected_matrices, matrices):
+      self.assertAllClose(expected, actual, rtol=0, atol=1e-6)
+
+  def test_multiplicative_swap_regret_optimizer(self):
+    """Tests that the stochastic matrices update as expected."""
+    minimization_problem = test_util.ConstantMinimizationProblem(
+        np.array([0.6, -0.1, 0.4]))
+    optimizer = MultiplicativeSwapRegretOptimizerWrapper(
+        gradient_descent.GradientDescentOptimizer(1.0),
+        initial_multiplier_radius=0.8)
+    train_op = optimizer.minimize_constrained(minimization_problem)
+
+    # Calculated using a numpy+python implementation of the algorithm.
+    expected_matrices = [
+        np.array([[0.4, 0.4, 0.4, 0.4], [0.2, 0.2, 0.2, 0.2],
+                  [0.2, 0.2, 0.2, 0.2], [0.2, 0.2, 0.2, 0.2]]),
+        np.array([[0.36999014, 0.38528351, 0.38528351, 0.38528351], [
+            0.23517483, 0.21720297, 0.21720297, 0.21720297
+        ], [0.17774131, 0.18882719, 0.18882719, 0.18882719],
+                  [0.21709373, 0.20868632, 0.20868632, 0.20868632]]),
+        np.array([[0.33972109, 0.36811863, 0.37118462, 0.36906575], [
+            0.27114826, 0.23738228, 0.23376693, 0.23626491
+        ], [0.15712313, 0.17641793, 0.17858959, 0.17708679],
+                  [0.23200752, 0.21808115, 0.21645886, 0.21758255]]),
+    ]
+
+    matrices = []
+    with self.test_session() as session:
+      session.run(standard_ops.global_variables_initializer())
+      while len(matrices) < len(expected_matrices):
+        matrices.append(session.run(optimizer.stochastic_matrix))
+        session.run(train_op)
+
+    for expected, actual in zip(expected_matrices, matrices):
+      self.assertAllClose(expected, actual, rtol=0, atol=1e-6)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/constrained_optimization/python/test_util.py b/tensorflow/contrib/constrained_optimization/python/test_util.py
new file mode 100644
index 0000000000..704b36ca4c
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/test_util.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains helpers used by tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.constrained_optimization.python import constrained_minimization_problem
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import standard_ops
+
+
+class ConstantMinimizationProblem(
+    constrained_minimization_problem.ConstrainedMinimizationProblem):
+  """A `ConstrainedMinimizationProblem` with constant constraint violations.
+
+  This minimization problem is intended for use in performing simple tests of
+  the Lagrange multiplier (or equivalent) update in the optimizers. There is a
+  one-element "dummy" model parameter, but it should be ignored.
+  """
+
+  def __init__(self, constraints):
+    """Constructs a new `ConstantMinimizationProblem`.
+
+    Args:
+      constraints: 1d numpy array, the constant constraint violations.
+
+    Returns:
+      A new `ConstantMinimizationProblem`.
+    """
+    # We make a fake 1-parameter linear objective so that we don't get a "no
+    # variables to optimize" error.
+    self._objective = standard_ops.Variable(0.0, dtype=dtypes.float32)
+    self._constraints = standard_ops.constant(constraints, dtype=dtypes.float32)
+
+  @property
+  def objective(self):
+    """Returns the objective function."""
+    return self._objective
+
+  @property
+  def constraints(self):
+    """Returns the constant constraint violations."""
+    return self._constraints
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 7b508f87ab..677ea65edd 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -63,6 +63,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis",
     "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
     "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
+    "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
     "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
     "//tensorflow/contrib/data/python/ops:contrib_op_loader",
     "//tensorflow/contrib/eager/python/examples:examples_pip",
-- 
GitLab


From 762fa5f6ead8f662e5cc14420293cb369f2b9615 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Mon, 23 Apr 2018 15:57:16 -0700
Subject: [PATCH 330/450] FakeQuant operations before ReLUs (occurs after
 bypass nodes) aren't needed.

PiperOrigin-RevId: 193999591
---
 .../contrib/quantize/python/quantize.py       | 68 ++++++++++++-------
 .../quantize/python/quantize_graph_test.py    | 14 ----
 .../contrib/quantize/python/quantize_test.py  | 57 ++++++++++++----
 3 files changed, 87 insertions(+), 52 deletions(-)

diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index d2d0426d23..efc1a94b3c 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -133,19 +133,27 @@ def Quantize(graph,
           bits=activation_bits,
           producer_scope=scope,
           consumer_scope=scope)
-      _InsertQuantOp(
-          add_context,
-          'add_quant',
-          layer_match.bypass_op,
-          input_to_ops_map.ConsumerOperations(layer_match.bypass_op),
-          is_training,
-          moving_avg=True,
-          ema_decay=ema_decay,
-          quant_delay=quant_delay,
-          vars_collection=vars_collection,
-          bits=activation_bits,
-          producer_scope=scope,
-          consumer_scope=scope)
+      # Check whether the op following this is an activation. If it is, we
+      # shouldn't quantize it, since the activation will be fused into the
+      # Add at inference time.
+      consumers = input_to_ops_map.ConsumerOperations(layer_match.bypass_op)
+      if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]):
+        logging.info('Skipping %s, because its followed by an activation.',
+                     layer_match.bypass_op.name)
+      else:
+        _InsertQuantOp(
+            add_context,
+            'add_quant',
+            layer_match.bypass_op,
+            input_to_ops_map.ConsumerOperations(layer_match.bypass_op),
+            is_training,
+            moving_avg=True,
+            ema_decay=ema_decay,
+            quant_delay=quant_delay,
+            vars_collection=vars_collection,
+            bits=activation_bits,
+            producer_scope=scope,
+            consumer_scope=scope)
 
     # Quantize bypass ops that occur after the activation.
     if layer_match.post_activation_bypass_op is not None:
@@ -153,19 +161,27 @@ def Quantize(graph,
           r'^(.*)/([^/]+)', layer_match.post_activation_bypass_op.name).group(1)
       # If `scope` is given, only quantize it if the producer is in the right
       # scope.
-      _InsertQuantOp(
-          post_activation_bypass_context,
-          'post_activation_bypass_quant',
-          layer_match.post_activation_bypass_op,
-          input_to_ops_map.ConsumerOperations(
-              layer_match.post_activation_bypass_op),
-          is_training,
-          moving_avg=True,
-          ema_decay=ema_decay,
-          quant_delay=quant_delay,
-          vars_collection=vars_collection,
-          bits=activation_bits,
-          producer_scope=scope)
+      # Check whether the op following this is an activation. If it is, we
+      # shouldn't quantize it, since the activation will be fused into the
+      # Add at inference time.
+      consumers = input_to_ops_map.ConsumerOperations(
+          layer_match.post_activation_bypass_op)
+      if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]):
+        logging.info('Skipping %s, because its followed by an activation.',
+                     layer_match.post_activation_bypass_op.name)
+      else:
+        _InsertQuantOp(
+            post_activation_bypass_context,
+            'post_activation_bypass_quant',
+            layer_match.post_activation_bypass_op,
+            consumers,
+            is_training,
+            moving_avg=True,
+            ema_decay=ema_decay,
+            quant_delay=quant_delay,
+            vars_collection=vars_collection,
+            bits=activation_bits,
+            producer_scope=scope)
 
 
 def _FindLayersToQuantize(graph):
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index caf8ff28d5..54faf582f1 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -113,20 +113,6 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       # Ensure that variables were added.
       self.assertTrue(len(orig_variable_names) < len(q_variables))
 
-  def testWithPreActivationBypass(self):
-    self._RunTestOverAllRewrites(self._TestWithPreActivationBypass)
-
-  def _TestWithPreActivationBypass(self, rewrite_fn):
-    # Tests that the default graph is correctly used when no args are provided
-    # to rewrite_fn.
-    with ops.Graph().as_default() as g:
-      self._ConvLayer(pre_activation_bypass=True, scope='scope1')
-      rewrite_fn()
-
-      op_names = [op.name for op in g.get_operations()]
-      self.assertTrue(
-          any('scope1/add_quant/' in name for name in op_names))
-
   def testWithPostActivationBypass(self):
     self._RunTestOverAllRewrites(self._TestWithPostActivationBypass)
 
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index d37c83d683..5e479f3946 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -82,9 +82,22 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
-    add_quant = graph.get_operation_by_name('test/add_quant/' +
-                                            quantization_node_name)
-    self.assertEqual(add_quant.type, quantization_node_name)
+    conv_quant = graph.get_operation_by_name('test/test/conv_quant/' +
+                                             quantization_node_name)
+    self.assertEqual(conv_quant.type, quantization_node_name)
+
+    # Scan through all FakeQuant operations, ensuring that the activation
+    # isn't in the consumers of the operation. Since activations are folded into
+    # the preceding operation during inference, the FakeQuant operation after
+    # the activation is all that is needed.
+    for op in graph.get_operations():
+      if op.type == quantization_node_name:
+        quant_op = graph.get_operation_by_name(op.name)
+        consumers = []
+        for output in quant_op.outputs:
+          consumers.extend(output.consumers())
+
+        self.assertNotIn('test/identity', [c.name for c in consumers])
 
   def testInsertQuantOpForAddAfterSeparableConv2d(self):
     self._RunTestOverParameters(
@@ -109,9 +122,20 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
-    add_quant = graph.get_operation_by_name('test/add_quant/' +
-                                            quantization_node_name)
-    self.assertEqual(add_quant.type, quantization_node_name)
+    conv_quant = graph.get_operation_by_name('test/test/conv_quant/' +
+                                             quantization_node_name)
+    self.assertEqual(conv_quant.type, quantization_node_name)
+
+    for op in graph.get_operations():
+      if op.type == quantization_node_name:
+        quant_op = graph.get_operation_by_name(op.name)
+        # Scan through all FakeQuant operations, ensuring that the activation
+        # identity op isn't in the consumers of the operation.
+        consumers = []
+        for output in quant_op.outputs:
+          consumers.extend(output.consumers())
+
+        self.assertNotIn('test/identity', [c.name for c in consumers])
 
   def testFinalLayerQuantized(self):
     self._RunTestOverParameters(self._TestFinalLayerQuantized)
@@ -153,12 +177,21 @@ class QuantizeTest(test_util.TensorFlowTestCase):
           activation_fn=array_ops.identity,
           scope='test/test')
       bypass_tensor = math_ops.add(conv, input2, name='test/add')
-      _ = array_ops.identity(bypass_tensor, name='test/output')
+      # The output of the post_activation bypass will be another layer.
+      _ = conv2d(
+          bypass_tensor,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=array_ops.identity,
+          scope='test/unused')
 
       quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
-      # Ensure that the bypass node is preceded and followed by
-      # FakeQuantWithMinMaxVars operations.
+      # Ensure that the bypass node is preceded by and followed by a
+      # FakeQuantWithMinMaxVars operation, since the output of the Add isn't an
+      # activation.
       self.assertTrue('FakeQuantWithMinMaxVars' in
                       [c.type for c in bypass_tensor.consumers()])
       self.assertTrue('FakeQuantWithMinMaxVars' in
@@ -198,9 +231,9 @@ class QuantizeTest(test_util.TensorFlowTestCase):
 
       quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
-      # Ensure that the bypass node is preceded and followed by
-      # FakeQuantWithMinMaxVars operations.
-      self.assertTrue('FakeQuantWithMinMaxVars' in
+      # Ensure that the bypass node is preceded by a FakeQuantWithMinMaxVars
+      # operation, and NOT followed by one.
+      self.assertTrue('FakeQuantWithMinMaxVars' not in
                       [c.type for c in bypass_tensor.consumers()])
       self.assertTrue('FakeQuantWithMinMaxVars' in
                       [i.op.type for i in bypass_tensor.op.inputs])
-- 
GitLab


From 5809ad4436863ac82279c66d6cff6a4bffd77878 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Mon, 23 Apr 2018 16:27:00 -0700
Subject: [PATCH 331/450] Add `static_state_saving_rnn` back to the `nn`
 module.

PiperOrigin-RevId: 194003971
---
 tensorflow/python/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 13f8420a67..c1702ae13c 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -160,6 +160,7 @@ nn.dynamic_rnn = rnn.dynamic_rnn
 nn.static_rnn = rnn.static_rnn
 nn.raw_rnn = rnn.raw_rnn
 nn.bidirectional_dynamic_rnn = rnn.bidirectional_dynamic_rnn
+nn.static_state_saving_rnn = rnn.static_state_saving_rnn
 nn.rnn_cell = rnn_cell
 
 # Symbols whitelisted for export without documentation.
-- 
GitLab


From ba39780114c648445d3285550bf7f5c1e9e8a251 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 16:29:27 -0700
Subject: [PATCH 332/450] Avoid inlining the split handler functions as it
 slows down the trainer startup significantly.

PiperOrigin-RevId: 194004319
---
 .../learner/batch/ordinal_split_handler.py    | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index 7df514cd20..9d6cc9245a 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -417,9 +417,18 @@ class SparseSplitHandler(InequalitySplitHandler):
     return (are_splits_ready, partition_ids, gains, split_infos)
 
 
-@function.Defun(dtypes.bool, dtypes.bool, dtypes.float32, dtypes.float32,
-                dtypes.int32, dtypes.float32, dtypes.float32, dtypes.float32,
-                dtypes.float32, dtypes.float32)
+@function.Defun(
+    dtypes.bool,
+    dtypes.bool,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.int32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    noinline=True)
 def dense_make_stats_update(is_active, are_buckets_ready, float_column,
                             quantile_buckets, example_partition_ids, gradients,
                             hessians, weights, empty_gradients, empty_hessians):
@@ -452,9 +461,20 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column,
           gradients, hessians)
 
 
-@function.Defun(dtypes.bool, dtypes.bool, dtypes.int64, dtypes.float32,
-                dtypes.int64, dtypes.float32, dtypes.int32, dtypes.float32,
-                dtypes.float32, dtypes.float32, dtypes.float32, dtypes.float32)
+@function.Defun(
+    dtypes.bool,
+    dtypes.bool,
+    dtypes.int64,
+    dtypes.float32,
+    dtypes.int64,
+    dtypes.float32,
+    dtypes.int32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    noinline=True)
 def sparse_make_stats_update(
     is_active, are_buckets_ready, sparse_column_indices, sparse_column_values,
     sparse_column_shape, quantile_buckets, example_partition_ids, gradients,
-- 
GitLab


From a72155d58726d4dbb92d5d6b0f3290976bbdaa1c Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 23 Apr 2018 16:33:27 -0700
Subject: [PATCH 333/450] Small fast path for binary_op_wrapper

PiperOrigin-RevId: 194004866
---
 tensorflow/python/ops/math_ops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 01d670ea2d..2b04866fef 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -965,7 +965,9 @@ def _OverrideBinaryOperatorHelper(func, op_name, clazz_object=ops.Tensor):
 
   def binary_op_wrapper(x, y):
     with ops.name_scope(None, op_name, [x, y]) as name:
-      if not isinstance(y, sparse_tensor.SparseTensor):
+      if isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor):
+        return func(x, y, name=name)
+      elif not isinstance(y, sparse_tensor.SparseTensor):
         try:
           y = ops.convert_to_tensor(y, dtype=x.dtype.base_dtype, name="y")
         except TypeError:
-- 
GitLab


From 84c73c2b4d0318bfd78a53ab6051169795604650 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Mon, 23 Apr 2018 16:46:41 -0700
Subject: [PATCH 334/450] TFTS: Support exogenous features in ARRegressor

They get flattened with the endogenous features as input to the model. Unlike
endogenous features, they're specified for the whole window when making
predictions.

Adds an ARRegressor example which uses exogenous features.

PiperOrigin-RevId: 194006630
---
 .../timeseries/examples/known_anomaly.py      |  75 +++++---
 .../timeseries/examples/known_anomaly_test.py |  18 +-
 .../timeseries/python/timeseries/ar_model.py  | 173 ++++++++++++++----
 .../python/timeseries/ar_model_test.py        |   8 +-
 .../python/timeseries/estimators.py           |  11 +-
 .../python/timeseries/estimators_test.py      |  48 +++--
 6 files changed, 255 insertions(+), 78 deletions(-)

diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly.py b/tensorflow/contrib/timeseries/examples/known_anomaly.py
index e77628ddd3..71621abc71 100644
--- a/tensorflow/contrib/timeseries/examples/known_anomaly.py
+++ b/tensorflow/contrib/timeseries/examples/known_anomaly.py
@@ -41,17 +41,8 @@ _MODULE_PATH = path.dirname(__file__)
 _DATA_FILE = path.join(_MODULE_PATH, "data/changepoints.csv")
 
 
-def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300):
-  """Training, evaluating, and predicting on a series with changepoints."""
-
-  # Indicate the format of our exogenous feature, in this case a string
-  # representing a boolean value.
-  string_feature = tf.feature_column.categorical_column_with_vocabulary_list(
-      key="is_changepoint", vocabulary_list=["no", "yes"])
-  # Specify the way this feature is presented to the model, here using a one-hot
-  # encoding.
-  one_hot_feature = tf.feature_column.indicator_column(
-      categorical_column=string_feature)
+def state_space_esitmator(exogenous_feature_columns):
+  """Constructs a StructuralEnsembleRegressor."""
 
   def _exogenous_update_condition(times, features):
     del times  # unused
@@ -62,14 +53,48 @@ def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300):
     # no changepoint.
     return tf.equal(tf.squeeze(features["is_changepoint"], axis=-1), "yes")
 
-  estimator = tf.contrib.timeseries.StructuralEnsembleRegressor(
-      periodicities=12,
-      # Extract a smooth period by constraining the number of latent values
-      # being cycled between.
-      cycle_num_latent_values=3,
-      num_features=1,
-      exogenous_feature_columns=[one_hot_feature],
-      exogenous_update_condition=_exogenous_update_condition)
+  return (
+      tf.contrib.timeseries.StructuralEnsembleRegressor(
+          periodicities=12,
+          # Extract a smooth period by constraining the number of latent values
+          # being cycled between.
+          cycle_num_latent_values=3,
+          num_features=1,
+          exogenous_feature_columns=exogenous_feature_columns,
+          exogenous_update_condition=_exogenous_update_condition),
+      # Use truncated backpropagation with a window size of 64, batching
+      # together 4 of these windows (random offsets) per training step. Training
+      # with exogenous features often requires somewhat larger windows.
+      4, 64)
+
+
+def autoregressive_esitmator(exogenous_feature_columns):
+  input_window_size = 8
+  output_window_size = 2
+  return (
+      tf.contrib.timeseries.ARRegressor(
+          periodicities=12,
+          num_features=1,
+          input_window_size=input_window_size,
+          output_window_size=output_window_size,
+          exogenous_feature_columns=exogenous_feature_columns),
+      64, input_window_size + output_window_size)
+
+
+def train_and_evaluate_exogenous(
+    estimator_fn, csv_file_name=_DATA_FILE, train_steps=300):
+  """Training, evaluating, and predicting on a series with changepoints."""
+  # Indicate the format of our exogenous feature, in this case a string
+  # representing a boolean value.
+  string_feature = tf.feature_column.categorical_column_with_vocabulary_list(
+      key="is_changepoint", vocabulary_list=["no", "yes"])
+  # Specify the way this feature is presented to the model, here using a one-hot
+  # encoding.
+  one_hot_feature = tf.feature_column.indicator_column(
+      categorical_column=string_feature)
+
+  estimator, batch_size, window_size = estimator_fn(
+      exogenous_feature_columns=[one_hot_feature])
   reader = tf.contrib.timeseries.CSVReader(
       csv_file_name,
       # Indicate the format of our CSV file. First we have two standard columns,
@@ -85,10 +110,7 @@ def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300):
       # This CSV has a header line; here we just ignore it.
       skip_header_lines=1)
   train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
-      # Use truncated backpropagation with a window size of 64, batching
-      # together 4 of these windows (random offsets) per training step. Training
-      # with exogenous features often requires somewhat larger windows.
-      reader, batch_size=4, window_size=64)
+      reader, batch_size=batch_size, window_size=window_size)
   estimator.train(input_fn=train_input_fn, steps=train_steps)
   evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
   evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1)
@@ -145,7 +167,12 @@ def main(unused_argv):
   if not HAS_MATPLOTLIB:
     raise ImportError(
         "Please install matplotlib to generate a plot from this example.")
-  make_plot("Ignoring a known anomaly", *train_and_evaluate_exogenous())
+  make_plot("Ignoring a known anomaly (state space)",
+            *train_and_evaluate_exogenous(
+                estimator_fn=state_space_esitmator))
+  make_plot("Ignoring a known anomaly (autoregressive)",
+            *train_and_evaluate_exogenous(
+                estimator_fn=autoregressive_esitmator, train_steps=3000))
   pyplot.show()
 
 
diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py
index c3e307cad8..8c64f2e186 100644
--- a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py
+++ b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py
@@ -23,12 +23,24 @@ from tensorflow.contrib.timeseries.examples import known_anomaly
 from tensorflow.python.platform import test
 
 
-class KnownAnaomalyExampleTest(test.TestCase):
+class KnownAnomalyExampleTest(test.TestCase):
 
-  def test_shapes_and_variance_structural(self):
+  def test_shapes_and_variance_structural_ar(self):
     (times, observed, all_times, mean, upper_limit, lower_limit,
      anomaly_locations) = known_anomaly.train_and_evaluate_exogenous(
-         train_steps=50)
+         train_steps=1, estimator_fn=known_anomaly.autoregressive_esitmator)
+    self.assertAllEqual(
+        anomaly_locations,
+        [25, 50, 75, 100, 125, 150, 175, 249])
+    self.assertAllEqual(all_times.shape, mean.shape)
+    self.assertAllEqual(all_times.shape, upper_limit.shape)
+    self.assertAllEqual(all_times.shape, lower_limit.shape)
+    self.assertAllEqual(times.shape, observed.shape)
+
+  def test_shapes_and_variance_structural_ssm(self):
+    (times, observed, all_times, mean, upper_limit, lower_limit,
+     anomaly_locations) = known_anomaly.train_and_evaluate_exogenous(
+         train_steps=50, estimator_fn=known_anomaly.state_space_esitmator)
     self.assertAllEqual(
         anomaly_locations,
         [25, 50, 75, 100, 125, 150, 175, 249])
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
index 4f6527a546..558d9480b4 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
@@ -60,7 +60,8 @@ class ARModel(model.TimeSeriesModel):
                num_features,
                num_time_buckets=10,
                loss=NORMAL_LIKELIHOOD_LOSS,
-               hidden_layer_sizes=None):
+               hidden_layer_sizes=None,
+               exogenous_feature_columns=None):
     """Constructs an auto-regressive model.
 
     Args:
@@ -81,6 +82,11 @@ class ARModel(model.TimeSeriesModel):
         observations and predictions, while the training loss is computed on
         normalized data (if input statistics are available).
       hidden_layer_sizes: list of sizes of hidden layers.
+      exogenous_feature_columns: A list of `tf.feature_column`s (for example
+          `tf.feature_column.embedding_column`) corresponding to exogenous
+          features which provide extra information to the model but are not part
+          of the series to be predicted. Passed to
+          `tf.feature_column.input_layer`.
     """
     self.input_window_size = input_window_size
     self.output_window_size = output_window_size
@@ -90,7 +96,12 @@ class ARModel(model.TimeSeriesModel):
     self.window_size = self.input_window_size + self.output_window_size
     self.loss = loss
     super(ARModel, self).__init__(
-        num_features=num_features)
+        num_features=num_features,
+        exogenous_feature_columns=exogenous_feature_columns)
+    if exogenous_feature_columns is not None:
+      self.exogenous_size = self._get_exogenous_embedding_shape()[-1]
+    else:
+      self.exogenous_size = 0
     assert num_time_buckets > 0
     self._buckets = int(num_time_buckets)
     if periodicities is None or not periodicities:
@@ -110,7 +121,10 @@ class ARModel(model.TimeSeriesModel):
     # that the serving input_receiver_fn gets placeholder shapes correct.
     return (array_ops.zeros([self.input_window_size], dtype=dtypes.int64),
             array_ops.zeros(
-                [self.input_window_size, self.num_features], dtype=self.dtype))
+                [self.input_window_size, self.num_features], dtype=self.dtype),
+            array_ops.zeros(
+                [self.input_window_size, self.exogenous_size],
+                dtype=self.dtype))
 
   # TODO(allenl,agarwal): Support sampling for AR.
   def random_model_parameters(self, seed=None):
@@ -163,7 +177,7 @@ class ARModel(model.TimeSeriesModel):
       activations.append((activation, activation_size))
     return activations
 
-  def prediction_ops(self, times, values):
+  def prediction_ops(self, times, values, exogenous_regressors):
     """Compute model predictions given input data.
 
     Args:
@@ -173,6 +187,8 @@ class ARModel(model.TimeSeriesModel):
           prediction times.
       values: A [batch size, self.input_window_size, self.num_features] Tensor
           with input features.
+      exogenous_regressors: A [batch size, self.window_size,
+          self.exogenous_size] Tensor with exogenous features.
     Returns:
       Tuple (predicted_mean, predicted_covariance), where each element is a
       Tensor with shape [batch size, self.output_window_size,
@@ -183,25 +199,33 @@ class ARModel(model.TimeSeriesModel):
     if self.input_window_size:
       values.get_shape().assert_is_compatible_with(
           [None, self.input_window_size, self.num_features])
+    if exogenous_regressors is not None:
+      exogenous_regressors.get_shape().assert_is_compatible_with(
+          [None, self.window_size, self.exogenous_size])
     # Create input features.
+    activation_components = []
     if self._periods:
       _, time_features = self._compute_time_features(times)
       activation_size = self.window_size * self._buckets * len(self._periods)
-      activation = array_ops.reshape(time_features, [-1, activation_size])
+      activation_components.append(
+          array_ops.reshape(time_features, [-1, activation_size]))
     else:
       activation_size = 0
-      activation = None
-
     if self.input_window_size:
       inp = array_ops.slice(values, [0, 0, 0], [-1, self.input_window_size, -1])
       inp_size = self.input_window_size * self.num_features
       inp = array_ops.reshape(inp, [-1, inp_size])
-      if activation is not None:
-        activation = array_ops.concat([inp, activation], 1)
-      else:
-        activation = inp
+      activation_components.append(inp)
       activation_size += inp_size
+    if self.exogenous_size:
+      exogenous_size = self.window_size * self.exogenous_size
+      activation_size += exogenous_size
+      exogenous_flattened = array_ops.reshape(
+          exogenous_regressors, [-1, exogenous_size])
+      activation_components.append(exogenous_flattened)
     assert activation_size
+    assert activation_components
+    activation = array_ops.concat(activation_components, axis=1)
     activations.append((activation, activation_size))
     # Create hidden layers.
     activations += self._create_hidden_stack(activation, activation_size)
@@ -228,6 +252,19 @@ class ARModel(model.TimeSeriesModel):
         math_ops.reduce_prod(array_ops.shape(targets)), loss_op.dtype)
     return loss_op
 
+  def _process_exogenous_features(self, times, features):
+    embedded = super(ARModel, self)._process_exogenous_features(
+        times=times, features=features)
+    if embedded is None:
+      assert self.exogenous_size == 0
+      # No embeddings. Return a zero-size [batch, times, 0] tensor so we don't
+      # have to special-case it downstream.
+      return array_ops.zeros(
+          array_ops.concat([array_ops.shape(times), constant_op.constant([0])],
+                           axis=0))
+    else:
+      return embedded
+
   # TODO(allenl, agarwal): Consider better ways of warm-starting predictions.
   def predict(self, features):
     """Computes predictions multiple steps into the future.
@@ -243,6 +280,7 @@ class ARModel(model.TimeSeriesModel):
           segment of the time series before `TIMES`. This data is used
           to start of the autoregressive computation. This should have data for
           at least self.input_window_size timesteps.
+        Any exogenous features, with shapes prefixed by the shape of `TIMES`.
     Returns:
       A dictionary with keys, "mean", "covariance". The
       values are Tensors of shape [batch_size, predict window size,
@@ -250,25 +288,39 @@ class ARModel(model.TimeSeriesModel):
     """
     predict_times = math_ops.cast(
         ops.convert_to_tensor(features[PredictionFeatures.TIMES]), dtypes.int32)
+    exogenous_regressors = self._process_exogenous_features(
+        times=predict_times,
+        features={key: value for key, value in features.items()
+                  if key not in [TrainEvalFeatures.TIMES,
+                                 TrainEvalFeatures.VALUES,
+                                 PredictionFeatures.STATE_TUPLE]})
+    with ops.control_dependencies(
+        [check_ops.assert_equal(array_ops.shape(predict_times)[1],
+                                array_ops.shape(exogenous_regressors)[1])]):
+      exogenous_regressors = array_ops.identity(exogenous_regressors)
     batch_size = array_ops.shape(predict_times)[0]
     num_predict_values = array_ops.shape(predict_times)[1]
     prediction_iterations = ((num_predict_values + self.output_window_size - 1)
                              // self.output_window_size)
-    # Pad predict_times so as to have exact multiple of self.output_window_size
-    # values per example.
+    # Pad predict_times and exogenous regressors so that each example has an
+    # exact multiple of self.output_window_size values.
     padding_size = (prediction_iterations * self.output_window_size -
                     num_predict_values)
-    padding = array_ops.zeros([batch_size, padding_size], predict_times.dtype)
-    predict_times = control_flow_ops.cond(
-        padding_size > 0, lambda: array_ops.concat([predict_times, padding], 1),
-        lambda: predict_times)
+    predict_times = array_ops.pad(
+        predict_times, [[0, 0], [0, padding_size]])
+    exogenous_regressors = array_ops.pad(
+        exogenous_regressors, [[0, 0], [0, padding_size], [0, 0]])
     state = features[PredictionFeatures.STATE_TUPLE]
-    (state_times, state_values) = state
+    (state_times, state_values, state_exogenous_regressors) = state
     state_times = math_ops.cast(
         ops.convert_to_tensor(state_times), dtypes.int32)
     state_values = ops.convert_to_tensor(state_values, dtype=self.dtype)
+    state_exogenous_regressors = ops.convert_to_tensor(
+        state_exogenous_regressors, dtype=self.dtype)
 
     initial_input_times = predict_times[:, :self.output_window_size]
+    initial_input_exogenous_regressors = (
+        exogenous_regressors[:, :self.output_window_size, :])
     if self.input_window_size > 0:
       initial_input_times = array_ops.concat(
           [state_times[:, -self.input_window_size:], initial_input_times], 1)
@@ -279,6 +331,11 @@ class ARModel(model.TimeSeriesModel):
           check_ops.assert_equal(values_size, times_size)
       ]):
         initial_input_values = state_values[:, -self.input_window_size:, :]
+        initial_input_exogenous_regressors = array_ops.concat(
+            [state_exogenous_regressors[:, -self.input_window_size:, :],
+             initial_input_exogenous_regressors[
+                 :, :self.output_window_size, :]],
+            axis=1)
     else:
       initial_input_values = 0
 
@@ -288,9 +345,10 @@ class ARModel(model.TimeSeriesModel):
       return math_ops.less(iteration_number, prediction_iterations)
 
     def _while_body(iteration_number, input_times, input_values,
-                    mean_ta, covariance_ta):
+                    input_exogenous_regressors, mean_ta, covariance_ta):
       """Predict self.output_window_size values."""
-      prediction_ops = self.prediction_ops(input_times, input_values)
+      prediction_ops = self.prediction_ops(
+          input_times, input_values, input_exogenous_regressors)
       predicted_mean = prediction_ops["mean"]
       predicted_covariance = prediction_ops["covariance"]
       offset = self.output_window_size * gen_math_ops.minimum(
@@ -299,20 +357,33 @@ class ARModel(model.TimeSeriesModel):
         if self.output_window_size < self.input_window_size:
           new_input_values = array_ops.concat(
               [input_values[:, self.output_window_size:, :], predicted_mean], 1)
+          new_input_exogenous_regressors = array_ops.concat(
+              [input_exogenous_regressors[:, -self.input_window_size:, :],
+               exogenous_regressors[
+                   :, offset:offset + self.output_window_size, :]],
+              axis=1)
           new_input_times = array_ops.concat([
-              input_times[:, self.output_window_size:],
+              input_times[:, -self.input_window_size:],
               predict_times[:, offset:offset + self.output_window_size]
           ], 1)
         else:
           new_input_values = predicted_mean[:, -self.input_window_size:, :]
+          new_input_exogenous_regressors = exogenous_regressors[
+              :,
+              offset - self.input_window_size:offset + self.output_window_size,
+              :]
           new_input_times = predict_times[
               :,
               offset - self.input_window_size:offset + self.output_window_size]
       else:
         new_input_values = input_values
+        new_input_exogenous_regressors = exogenous_regressors[
+            :, offset:offset + self.output_window_size, :]
         new_input_times = predict_times[:,
                                         offset:offset + self.output_window_size]
       new_input_times.set_shape(initial_input_times.get_shape())
+      new_input_exogenous_regressors.set_shape(
+          initial_input_exogenous_regressors.get_shape())
       new_mean_ta = mean_ta.write(iteration_number, predicted_mean)
       if isinstance(covariance_ta, tensor_array_ops.TensorArray):
         new_covariance_ta = covariance_ta.write(iteration_number,
@@ -322,6 +393,7 @@ class ARModel(model.TimeSeriesModel):
       return (iteration_number + 1,
               new_input_times,
               new_input_values,
+              new_input_exogenous_regressors,
               new_mean_ta,
               new_covariance_ta)
 
@@ -332,9 +404,13 @@ class ARModel(model.TimeSeriesModel):
                           if self.loss != ARModel.SQUARED_LOSS else 0.)
     mean_ta_init = tensor_array_ops.TensorArray(
         dtype=self.dtype, size=prediction_iterations)
-    _, _, _, mean_ta, covariance_ta = control_flow_ops.while_loop(
+    _, _, _, _, mean_ta, covariance_ta = control_flow_ops.while_loop(
         _while_condition, _while_body, [
-            0, initial_input_times, initial_input_values, mean_ta_init,
+            0,
+            initial_input_times,
+            initial_input_values,
+            initial_input_exogenous_regressors,
+            mean_ta_init,
             covariance_ta_init
         ])
 
@@ -366,11 +442,11 @@ class ARModel(model.TimeSeriesModel):
     return {"mean": predicted_mean,
             "covariance": predicted_covariance}
 
-  def _process_window(self, features, mode):
+  def _process_window(self, features, mode, exogenous_regressors):
     """Compute model outputs on a single window of data."""
-    # TODO(agarwal): Use exogenous features
     times = math_ops.cast(features[TrainEvalFeatures.TIMES], dtypes.int64)
     values = math_ops.cast(features[TrainEvalFeatures.VALUES], dtype=self.dtype)
+    exogenous_regressors = math_ops.cast(exogenous_regressors, dtype=self.dtype)
     original_values = values
 
     # Extra shape checking for the window size (above that in
@@ -395,7 +471,8 @@ class ARModel(model.TimeSeriesModel):
       input_values = values[:, :self.input_window_size, :]
     else:
       input_values = None
-    prediction_ops = self.prediction_ops(times, input_values)
+    prediction_ops = self.prediction_ops(
+        times, input_values, exogenous_regressors)
     prediction = prediction_ops["mean"]
     covariance = prediction_ops["covariance"]
     targets = array_ops.slice(values, [0, self.input_window_size, 0],
@@ -419,7 +496,8 @@ class ARModel(model.TimeSeriesModel):
     return model.ModelOutputs(
         loss=loss,
         end_state=(times[:, -self.input_window_size:],
-                   values[:, -self.input_window_size:, :]),
+                   values[:, -self.input_window_size:, :],
+                   exogenous_regressors[:, -self.input_window_size:, :]),
         predictions={"mean": prediction, "covariance": covariance,
                      "observed": original_values[:, -self.output_window_size:]},
         prediction_times=times[:, -self.output_window_size:])
@@ -454,17 +532,24 @@ class ARModel(model.TimeSeriesModel):
     """
     features = {feature_name: ops.convert_to_tensor(feature_value)
                 for feature_name, feature_value in features.items()}
+    times = features[TrainEvalFeatures.TIMES]
+    exogenous_regressors = self._process_exogenous_features(
+        times=times,
+        features={key: value for key, value in features.items()
+                  if key not in [TrainEvalFeatures.TIMES,
+                                 TrainEvalFeatures.VALUES,
+                                 PredictionFeatures.STATE_TUPLE]})
     if mode == estimator_lib.ModeKeys.TRAIN:
       # For training, we require the window size to be self.window_size as
       # iterating sequentially on larger windows could introduce a bias.
-      return self._process_window(features, mode=mode)
+      return self._process_window(
+          features, mode=mode, exogenous_regressors=exogenous_regressors)
     elif mode == estimator_lib.ModeKeys.EVAL:
       # For evaluation, we allow the user to pass in a larger window, in which
       # case we try to cover as much of the window as possible without
       # overlap. Quantitative evaluation is more efficient/correct with fixed
       # windows matching self.window_size (as with training), but this looping
       # allows easy plotting of "in-sample" predictions.
-      times = features[TrainEvalFeatures.TIMES]
       times.get_shape().assert_has_rank(2)
       static_window_size = times.get_shape()[1].value
       if (static_window_size is not None
@@ -500,7 +585,9 @@ class ARModel(model.TimeSeriesModel):
                 feature_name:
                 feature_value[:, base_offset:base_offset + self.window_size]
                 for feature_name, feature_value in features.items()},
-            mode=mode)
+            mode=mode,
+            exogenous_regressors=exogenous_regressors[
+                :, base_offset:base_offset + self.window_size])
         # This code needs to be updated if new predictions are added in
         # self._process_window
         assert len(model_outputs.predictions) == 3
@@ -525,7 +612,9 @@ class ARModel(model.TimeSeriesModel):
       batch_size = array_ops.shape(times)[0]
       prediction_shape = [batch_size, self.output_window_size * num_iterations,
                           self.num_features]
-      previous_state_times, previous_state_values = state
+      (previous_state_times,
+       previous_state_values,
+       previous_state_exogenous_regressors) = state
       # Make sure returned state always has windows of self.input_window_size,
       # even if we were passed fewer than self.input_window_size points this
       # time.
@@ -540,14 +629,24 @@ class ARModel(model.TimeSeriesModel):
              self._scale_data(values)], axis=1)[:, -self.input_window_size:, :]
         new_state_values.set_shape((None, self.input_window_size,
                                     self.num_features))
+        new_exogenous_regressors = array_ops.concat(
+            [previous_state_exogenous_regressors,
+             exogenous_regressors], axis=1)[:, -self.input_window_size:, :]
+        new_exogenous_regressors.set_shape(
+            (None,
+             self.input_window_size,
+             self.exogenous_size))
       else:
         # There is no state to keep, and the strided slices above do not handle
         # input_window_size=0.
         new_state_times = previous_state_times
         new_state_values = previous_state_values
+        new_exogenous_regressors = previous_state_exogenous_regressors
       return model.ModelOutputs(
           loss=math_ops.reduce_mean(loss_ta.stack(), axis=0),
-          end_state=(new_state_times, new_state_values),
+          end_state=(new_state_times,
+                     new_state_values,
+                     new_exogenous_regressors),
           predictions={
               "mean": array_ops.reshape(
                   array_ops.transpose(mean_ta.stack(), [1, 0, 2, 3]),
@@ -604,7 +703,8 @@ class AnomalyMixtureARModel(ARModel):
                num_features,
                anomaly_distribution=GAUSSIAN_ANOMALY,
                num_time_buckets=10,
-               hidden_layer_sizes=None):
+               hidden_layer_sizes=None,
+               exogenous_feature_columns=None):
     assert (anomaly_prior_probability < 1.0 and
             anomaly_prior_probability > 0.0)
     self._anomaly_prior_probability = anomaly_prior_probability
@@ -619,7 +719,8 @@ class AnomalyMixtureARModel(ARModel):
         input_window_size=input_window_size,
         output_window_size=output_window_size,
         loss=ARModel.NORMAL_LIKELIHOOD_LOSS,
-        hidden_layer_sizes=hidden_layer_sizes)
+        hidden_layer_sizes=hidden_layer_sizes,
+        exogenous_feature_columns=exogenous_feature_columns)
 
   def _create_anomaly_ops(self, times, values, prediction_ops_dict):
     anomaly_log_param = variable_scope.get_variable(
@@ -631,9 +732,9 @@ class AnomalyMixtureARModel(ARModel):
     # distribution.
     prediction_ops_dict["anomaly_params"] = gen_math_ops.exp(anomaly_log_param)
 
-  def prediction_ops(self, times, values):
+  def prediction_ops(self, times, values, exogenous_regressors):
     prediction_ops_dict = super(AnomalyMixtureARModel, self).prediction_ops(
-        times, values)
+        times, values, exogenous_regressors)
     self._create_anomaly_ops(times, values, prediction_ops_dict)
     return prediction_ops_dict
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
index 1e1ca4e77f..d078ac8d46 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
@@ -155,12 +155,15 @@ class ARModelTest(test.TestCase):
     state_times = np.expand_dims(train_data_times[:input_window_size], 0)
     state_values = np.expand_dims(
         train_data_values[:input_window_size, :], 0)
+    state_exogenous = state_times[:, :, None][:, :, :0]
 
     def prediction_input_fn():
       return ({
           PredictionFeatures.TIMES: training.limit_epochs(
               predict_times, num_epochs=1),
-          PredictionFeatures.STATE_TUPLE: (state_times, state_values)
+          PredictionFeatures.STATE_TUPLE: (state_times,
+                                           state_values,
+                                           state_exogenous)
       }, {})
     (predictions,) = tuple(estimator.predict(input_fn=prediction_input_fn))
     predicted_mean = predictions["mean"][:, 0]
@@ -246,7 +249,8 @@ class ARModelTest(test.TestCase):
       with session.Session():
         predicted_values = model.predict({
             PredictionFeatures.TIMES: [[4, 6, 10]],
-            PredictionFeatures.STATE_TUPLE: ([[1, 2]], [[[1.], [2.]]])
+            PredictionFeatures.STATE_TUPLE: (
+                [[1, 2]], [[[1.], [2.]]], [[[], []]])
         })
         variables.global_variables_initializer().run()
         self.assertAllEqual(predicted_values["mean"].eval().shape,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
index 886e1846e2..f4608ca2d1 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -190,7 +190,7 @@ class ARRegressor(TimeSeriesRegressor):
 
   def __init__(
       self, periodicities, input_window_size, output_window_size,
-      num_features, num_time_buckets=10,
+      num_features, exogenous_feature_columns=None, num_time_buckets=10,
       loss=ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS, hidden_layer_sizes=None,
       anomaly_prior_probability=None, anomaly_distribution=None,
       optimizer=None, model_dir=None, config=None):
@@ -205,7 +205,12 @@ class ARRegressor(TimeSeriesRegressor):
       output_window_size: Number of future time steps to predict. Note that
         setting it to > 1 empirically seems to give a better fit.
       num_features: The dimensionality of the time series (one for univariate,
-          more than one for multivariate).
+        more than one for multivariate).
+      exogenous_feature_columns: A list of `tf.feature_column`s (for example
+        `tf.feature_column.embedding_column`) corresponding to exogenous
+        features which provide extra information to the model but are not part
+        of the series to be predicted. Passed to
+        `tf.feature_column.input_layer`.
       num_time_buckets: Number of buckets into which to divide (time %
         periodicity) for generating time based features.
       loss: Loss function to use for training. Currently supported values are
@@ -241,6 +246,7 @@ class ARRegressor(TimeSeriesRegressor):
         anomaly_distribution = ar_model.AnomalyMixtureARModel.GAUSSIAN_ANOMALY
       model = ar_model.ARModel(
           periodicities=periodicities, num_features=num_features,
+          exogenous_feature_columns=exogenous_feature_columns,
           num_time_buckets=num_time_buckets,
           input_window_size=input_window_size,
           output_window_size=output_window_size, loss=loss,
@@ -255,6 +261,7 @@ class ARRegressor(TimeSeriesRegressor):
           input_window_size=input_window_size,
           output_window_size=output_window_size,
           num_features=num_features,
+          exogenous_feature_columns=exogenous_feature_columns,
           num_time_buckets=num_time_buckets,
           hidden_layer_sizes=hidden_layer_sizes,
           anomaly_prior_probability=anomaly_prior_probability,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
index 9f161c1695..eebee053f8 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
@@ -48,12 +49,17 @@ class TimeSeriesRegressorTest(test.TestCase):
   def _fit_restore_fit_test_template(self, estimator_fn, dtype):
     """Tests restoring previously fit models."""
     model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    first_estimator = estimator_fn(model_dir)
+    exogenous_feature_columns = (
+        feature_column.numeric_column("exogenous"),
+    )
+    first_estimator = estimator_fn(model_dir, exogenous_feature_columns)
     times = numpy.arange(20, dtype=numpy.int64)
     values = numpy.arange(20, dtype=dtype.as_numpy_dtype)
+    exogenous = numpy.arange(20, dtype=dtype.as_numpy_dtype)
     features = {
         feature_keys.TrainEvalFeatures.TIMES: times,
-        feature_keys.TrainEvalFeatures.VALUES: values
+        feature_keys.TrainEvalFeatures.VALUES: values,
+        "exogenous": exogenous
     }
     train_input_fn = input_pipeline.RandomWindowInputFn(
         input_pipeline.NumpyReader(features), shuffle_seed=2, num_threads=1,
@@ -68,14 +74,19 @@ class TimeSeriesRegressorTest(test.TestCase):
     first_loss_after_fit = first_estimator.evaluate(
         input_fn=eval_input_fn, steps=1)["loss"]
     self.assertLess(first_loss_after_fit, first_loss_before_fit)
-    second_estimator = estimator_fn(model_dir)
+    second_estimator = estimator_fn(model_dir, exogenous_feature_columns)
     second_estimator.train(input_fn=train_input_fn, steps=2)
     whole_dataset_input_fn = input_pipeline.WholeDatasetInputFn(
         input_pipeline.NumpyReader(features))
     whole_dataset_evaluation = second_estimator.evaluate(
         input_fn=whole_dataset_input_fn, steps=1)
+    exogenous_values_ten_steps = {
+        "exogenous": numpy.arange(
+            10, dtype=dtype.as_numpy_dtype)[None, :, None]
+    }
     predict_input_fn = input_pipeline.predict_continuation_input_fn(
         evaluation=whole_dataset_evaluation,
+        exogenous_features=exogenous_values_ten_steps,
         steps=10)
     # Also tests that limit_epochs in predict_continuation_input_fn prevents
     # infinite iteration
@@ -92,6 +103,7 @@ class TimeSeriesRegressorTest(test.TestCase):
         saved_prediction = saved_model_utils.predict_continuation(
             continue_from=whole_dataset_evaluation,
             steps=10,
+            exogenous_features=exogenous_values_ten_steps,
             signatures=signatures,
             session=sess)
         # Saved model predictions should be the same as Estimator predictions
@@ -104,7 +116,8 @@ class TimeSeriesRegressorTest(test.TestCase):
             continue_from=whole_dataset_evaluation,
             features={
                 feature_keys.FilteringFeatures.TIMES: times[None, -1] + 2,
-                feature_keys.FilteringFeatures.VALUES: values[None, -1] + 2.
+                feature_keys.FilteringFeatures.VALUES: values[None, -1] + 2.,
+                "exogenous": values[None, -1, None] + 12.
             },
             signatures=signatures,
             session=sess)
@@ -112,6 +125,10 @@ class TimeSeriesRegressorTest(test.TestCase):
         second_saved_prediction = saved_model_utils.predict_continuation(
             continue_from=first_filtering,
             steps=1,
+            exogenous_features={
+                "exogenous": numpy.arange(
+                    1, dtype=dtype.as_numpy_dtype)[None, :, None]
+            },
             signatures=signatures,
             session=sess)
         self.assertEqual(
@@ -122,7 +139,8 @@ class TimeSeriesRegressorTest(test.TestCase):
             continue_from=first_filtering,
             features={
                 feature_keys.FilteringFeatures.TIMES: times[-1] + 3,
-                feature_keys.FilteringFeatures.VALUES: values[-1] + 3.
+                feature_keys.FilteringFeatures.VALUES: values[-1] + 3.,
+                "exogenous": values[-1, None] + 13.
             },
             signatures=signatures,
             session=sess)
@@ -131,7 +149,8 @@ class TimeSeriesRegressorTest(test.TestCase):
         six.assertCountEqual(
             self,
             [feature_keys.FilteringFeatures.TIMES,
-             feature_keys.FilteringFeatures.VALUES],
+             feature_keys.FilteringFeatures.VALUES,
+             "exogenous"],
             signatures.signature_def[
                 feature_keys.SavedModelLabels.COLD_START_FILTER].inputs.keys())
         batch_numpy_times = numpy.tile(
@@ -142,7 +161,8 @@ class TimeSeriesRegressorTest(test.TestCase):
             session=sess,
             features={
                 feature_keys.FilteringFeatures.TIMES: batch_numpy_times,
-                feature_keys.FilteringFeatures.VALUES: batch_numpy_values
+                feature_keys.FilteringFeatures.VALUES: batch_numpy_values,
+                "exogenous": 10. + batch_numpy_values
             }
         )
         predict_times = numpy.tile(
@@ -150,26 +170,32 @@ class TimeSeriesRegressorTest(test.TestCase):
         predictions = saved_model_utils.predict_continuation(
             continue_from=state,
             times=predict_times,
+            exogenous_features={
+                "exogenous": numpy.tile(numpy.arange(
+                    15, dtype=dtype.as_numpy_dtype), (10,))[None, :, None]
+            },
             signatures=signatures,
             session=sess)
         self.assertAllEqual([10, 15, 1], predictions["mean"].shape)
 
   def test_fit_restore_fit_ar_regressor(self):
-    def _estimator_fn(model_dir):
+    def _estimator_fn(model_dir, exogenous_feature_columns):
       return estimators.ARRegressor(
           periodicities=10, input_window_size=10, output_window_size=6,
           num_features=1, model_dir=model_dir, config=_SeedRunConfig(),
           # This test is flaky with normal likelihood loss (could add more
           # training iterations instead).
-          loss=ar_model.ARModel.SQUARED_LOSS)
+          loss=ar_model.ARModel.SQUARED_LOSS,
+          exogenous_feature_columns=exogenous_feature_columns)
     self._fit_restore_fit_test_template(_estimator_fn, dtype=dtypes.float32)
 
   def test_fit_restore_fit_structural_ensemble_regressor(self):
     dtype = dtypes.float32
-    def _estimator_fn(model_dir):
+    def _estimator_fn(model_dir, exogenous_feature_columns):
       return estimators.StructuralEnsembleRegressor(
           num_features=1, periodicities=10, model_dir=model_dir, dtype=dtype,
-          config=_SeedRunConfig())
+          config=_SeedRunConfig(),
+          exogenous_feature_columns=exogenous_feature_columns)
     self._fit_restore_fit_test_template(_estimator_fn, dtype=dtype)
 
 
-- 
GitLab


From a36e6edab33c7a5bef2f911d4d7bb88ffc8c7de6 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Mon, 23 Apr 2018 16:51:59 -0700
Subject: [PATCH 335/450] Handle missing params for a few ops in Toco using
 default values.

PiperOrigin-RevId: 194007329
---
 .../contrib/lite/toco/import_tensorflow.cc    | 31 +++++++++++++------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 155d890c9f..2ed05cb372 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -1093,8 +1093,10 @@ void ConvertMatMulOperator(const NodeDef& node,
 
   // Transpose flags should be easy to support, but we don't have a
   // GraphDef with them to test on at the moment.
-  CHECK_EQ(GetBoolAttr(node, "transpose_a"), false);
-  CHECK_EQ(GetBoolAttr(node, "transpose_b"), false);
+  CHECK_EQ(HasAttr(node, "transpose_a") && GetBoolAttr(node, "transpose_a"),
+           false);
+  CHECK_EQ(HasAttr(node, "transpose_b") && GetBoolAttr(node, "transpose_b"),
+           false);
   CHECK(!HasAttr(node, "adjoint_a") ||
         (GetBoolAttr(node, "adjoint_a") == false));
   CHECK(!HasAttr(node, "adjoint_b") ||
@@ -1300,11 +1302,17 @@ void ConvertStridedSliceOperator(const NodeDef& node,
   }
   op->outputs.push_back(node.name());
 
-  op->begin_mask = GetIntAttr(node, "begin_mask");
-  op->ellipsis_mask = GetIntAttr(node, "ellipsis_mask");
-  op->end_mask = GetIntAttr(node, "end_mask");
-  op->new_axis_mask = GetIntAttr(node, "new_axis_mask");
-  op->shrink_axis_mask = GetIntAttr(node, "shrink_axis_mask");
+  op->begin_mask =
+      HasAttr(node, "begin_mask") ? GetIntAttr(node, "begin_mask") : 0;
+  op->ellipsis_mask =
+      HasAttr(node, "ellipsis_mask") ? GetIntAttr(node, "ellipsis_mask") : 0;
+  op->end_mask = HasAttr(node, "end_mask") ? GetIntAttr(node, "end_mask") : 0;
+  op->new_axis_mask =
+      HasAttr(node, "new_axis_mask") ? GetIntAttr(node, "new_axis_mask") : 0;
+  op->shrink_axis_mask = HasAttr(node, "shrink_axis_mask")
+                             ? GetIntAttr(node, "shrink_axis_mask")
+                             : 0;
+
   model->operators.emplace_back(op);
 }
 
@@ -1394,8 +1402,11 @@ void ConvertArgMaxOperator(const NodeDef& node,
                            Model* model) {
   CHECK_EQ(node.op(), "ArgMax");
   CheckInputsCount(node, tf_import_flags, 2);
-  const auto axis_data_type = GetDataTypeAttr(node, "Tidx");
-  const auto output_type = GetDataTypeAttr(node, "output_type");
+  const auto axis_data_type =
+      HasAttr(node, "Tidx") ? GetDataTypeAttr(node, "Tidx") : DT_INT32;
+  const auto output_type = HasAttr(node, "output_type")
+                               ? GetDataTypeAttr(node, "output_type")
+                               : DT_INT64;
   CHECK(axis_data_type == DT_INT64 || axis_data_type == DT_INT32);
   CHECK(output_type == DT_INT64 || output_type == DT_INT32);
   auto* op = new ArgMaxOperator;
@@ -1772,7 +1783,7 @@ void ConvertStackOperator(const NodeDef& node,
     op->inputs.push_back(node.input(i));
   }
   // Both "Stack" and "Pack" have the "axis" attribute.
-  op->axis = GetIntAttr(node, "axis");
+  op->axis = HasAttr(node, "axis") ? GetIntAttr(node, "axis") : 0;
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
 }
-- 
GitLab


From 771f7b46d631fa510658685d1b84ffbb22ffcd55 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Mon, 23 Apr 2018 17:10:05 -0700
Subject: [PATCH 336/450] Improve TOCO SavedModel support.

PiperOrigin-RevId: 194009891
---
 tensorflow/contrib/lite/python/BUILD          |  45 +-
 tensorflow/contrib/lite/python/convert.py     | 187 +++++++++
 .../lite/python/convert_saved_model.py        | 387 ++++++++++++------
 .../lite/python/convert_saved_model_test.py   | 172 ++++++--
 .../convert_saved_model_to_frozen_graph.py    | 106 +++++
 .../python/{lite_test.py => convert_test.py}  |  41 +-
 tensorflow/contrib/lite/python/lite.py        | 204 +--------
 .../contrib/lite/python/lite_constants.py     |  53 +++
 8 files changed, 828 insertions(+), 367 deletions(-)
 create mode 100644 tensorflow/contrib/lite/python/convert.py
 create mode 100644 tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py
 rename tensorflow/contrib/lite/python/{lite_test.py => convert_test.py} (82%)
 create mode 100644 tensorflow/contrib/lite/python/lite_constants.py

diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 926896d609..e6dcc7aa09 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -39,16 +39,35 @@ py_test(
 py_library(
     name = "lite",
     srcs = ["lite.py"],
-    # data = [
-    #     "//tensorflow/contrib/lite/toco/python:toco_from_protos",
-    # ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":convert",
+        ":convert_saved_model",
         ":op_hint",
+    ],
+)
+
+py_library(
+    name = "lite_constants",
+    srcs = ["lite_constants.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
+    ],
+)
+
+py_library(
+    name = "convert",
+    srcs = ["convert.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":lite_constants",
         "//tensorflow/contrib/lite/toco:model_flags_proto_py",
         "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
         "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco",
+        "//tensorflow/contrib/lite/toco/python:toco_from_protos",
         "//tensorflow/python:platform",
     ],
 )
@@ -66,15 +85,15 @@ py_library(
 )
 
 py_test(
-    name = "lite_test",
-    srcs = ["lite_test.py"],
+    name = "convert_test",
+    srcs = ["convert_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "no-internal-py3",
         "no_oss",
     ],
     deps = [
-        ":lite",
+        ":convert",
         ":op_hint",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -84,13 +103,14 @@ py_test(
     ],
 )
 
-py_binary(
+py_library(
     name = "convert_saved_model",
     srcs = ["convert_saved_model.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        ":lite",
+        ":convert",
+        ":lite_constants",
         "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/python:graph_util",
         "//tensorflow/python/tools:freeze_graph_lib",
@@ -130,6 +150,15 @@ py_test(
     ],
 )
 
+py_binary(
+    name = "convert_saved_model_to_frozen_graph",
+    srcs = ["convert_saved_model_to_frozen_graph.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convert_saved_model",
+    ],
+)
+
 # Transitive dependencies of this target will be included in the pip package.
 py_library(
     name = "tf_lite_py_pip",
diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
new file mode 100644
index 0000000000..c4200c879b
--- /dev/null
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -0,0 +1,187 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts a frozen graph into a TFLite FlatBuffer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os as _os
+import subprocess as _subprocess
+import tempfile as _tempfile
+
+from tensorflow.contrib.lite.python import lite_constants
+from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
+from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.python.framework import dtypes as _dtypes
+from tensorflow.python.platform import resource_loader as _resource_loader
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+
+# Lazy load since some of the performance benchmark skylark rules
+# break dependencies.
+_toco_python = LazyLoader(
+    "tensorflow_wrap_toco", globals(),
+    "tensorflow.contrib.lite.toco.python."
+    "tensorflow_wrap_toco")
+del LazyLoader
+
+# Find the toco_from_protos binary using the resource loader if using from
+# bazel, otherwise we are in a pip where console_scripts already has
+# the toco_from_protos tool.
+if lite_constants.EXPERIMENTAL_USE_TOCO_API_DIRECTLY:
+  _toco_from_proto_bin = ""
+else:
+  _toco_from_proto_bin = _resource_loader.get_path_to_datafile(
+      "../toco/python/toco_from_protos")
+
+if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin):
+  _toco_from_proto_bin = "toco_from_protos"
+
+
+def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
+  """Convert `input_data_str` according to model and toco parameters.
+
+  Unless you know what you are doing consider using
+  the more friendly @{tf.contrib.lite.toco_convert}.
+
+  Args:
+    model_flags_str: Serialized proto describing model properties, see
+      `toco/model_flags.proto`.
+    toco_flags_str: Serialized proto describing conversion properties, see
+      `toco/toco_flags.proto`.
+    input_data_str: Input data in serialized form (e.g. a graphdef is common)
+  Returns:
+    Converted model in serialized form (e.g. a TFLITE model is common).
+  Raises:
+    RuntimeError: When conversion fails, an exception is raised with the error
+      message embedded.
+  """
+  # TODO(aselle): When toco does not use fatal errors for failure, we can
+  # switch this on.
+  if not _toco_from_proto_bin:
+    return _toco_python.TocoConvert(
+        model_flags_str, toco_flags_str, input_data_str)
+
+  with _tempfile.NamedTemporaryFile() as fp_toco, \
+           _tempfile.NamedTemporaryFile() as fp_model, \
+           _tempfile.NamedTemporaryFile() as fp_input, \
+           _tempfile.NamedTemporaryFile() as fp_output:
+    fp_model.write(model_flags_str)
+    fp_toco.write(toco_flags_str)
+    fp_input.write(input_data_str)
+    fp_model.flush()
+    fp_toco.flush()
+    fp_input.flush()
+
+    cmd = [
+        _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name,
+        fp_output.name
+    ]
+    cmdline = " ".join(cmd)
+    proc = _subprocess.Popen(
+        cmdline,
+        shell=True,
+        stdout=_subprocess.PIPE,
+        stderr=_subprocess.STDOUT,
+        close_fds=True)
+    stdout, stderr = proc.communicate()
+    exitcode = proc.returncode
+    if exitcode == 0:
+      stuff = fp_output.read()
+      return stuff
+    else:
+      raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" %
+                         (stdout, stderr))
+
+
+def tensor_name(x):
+  return x.name.split(":")[0]
+
+
+def toco_convert(input_data,
+                 input_tensors,
+                 output_tensors,
+                 inference_type=lite_constants.FLOAT,
+                 input_format=lite_constants.TENSORFLOW_GRAPHDEF,
+                 output_format=lite_constants.TFLITE,
+                 quantized_input_stats=None,
+                 drop_control_dependency=True):
+  """Convert a model using TOCO from `input_format` to `output_format`.
+
+  Typically this is to convert from TensorFlow GraphDef to TFLite, in which
+  case the default `input_format` and `output_format` are sufficient.
+
+  Args:
+    input_data: Input data (i.e. often `sess.graph_def`).
+    input_tensors: List of input tensors. Type and shape are computed using
+      `foo.get_shape()` and `foo.dtype`.
+    output_tensors: List of output tensors (only .name is used from this).
+    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
+    input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF).
+    output_format: Type of data to write (currently must be TFLITE or
+      GRAPHVIZ_DOT)
+    quantized_input_stats: For each member of input_tensors the mean and
+      std deviation of training data. Only needed if `inference_type` is
+      `QUANTIZED_UINT8`.
+    drop_control_dependency: Drops control dependencies silently. This is due
+      to tf lite not supporting control dependencies.
+
+  Returns:
+    The converted data. For example if tflite was the destination, then
+    this will be a tflite flatbuffer in a bytes array.
+
+  Raises:
+    ValueError: If the input tensor type is unknown
+    RuntimeError: If TOCO fails to convert (in which case the runtime error's
+      error text will contain the TOCO error log)
+  """
+  toco = _toco_flags_pb2.TocoFlags()
+  toco.input_format = input_format
+  toco.output_format = output_format
+  toco.drop_control_dependency = drop_control_dependency
+  model = _model_flags_pb2.ModelFlags()
+  toco.inference_type = inference_type
+  for idx, input_tensor in enumerate(input_tensors):
+    if input_tensor.dtype == _dtypes.float32:
+      tflite_input_type = lite_constants.FLOAT
+    elif input_tensor.dtype == _dtypes.int32:
+      tflite_input_type = lite_constants.INT32
+    elif input_tensor.dtype == _dtypes.int64:
+      tflite_input_type = lite_constants.INT64
+    # TODO(aselle): Insert strings when they are available
+    else:
+      raise ValueError("Tensors %s not known type %r" % (input_tensor.name,
+                                                         input_tensor.dtype))
+
+    input_array = model.input_arrays.add()
+
+    if inference_type == lite_constants.QUANTIZED_UINT8:
+      if tflite_input_type == lite_constants.FLOAT:
+        tflite_input_type = lite_constants.QUANTIZED_UINT8
+      input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
+
+    input_array.name = tensor_name(input_tensor)
+    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
+
+  for output_tensor in output_tensors:
+    model.output_arrays.append(tensor_name(output_tensor))
+
+  # TODO(aselle): Consider handling the case of allowing quantized
+  # inputs to be converted to float (via the toco.inference_input_type field).
+  data = toco_convert_protos(model.SerializeToString(),
+                             toco.SerializeToString(),
+                             input_data.SerializeToString())
+  return data
diff --git a/tensorflow/contrib/lite/python/convert_saved_model.py b/tensorflow/contrib/lite/python/convert_saved_model.py
index a2b5ef488e..a7eddf3408 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model.py
@@ -12,52 +12,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-r"""TensorFlow Lite flatbuffer generation from saved_models.
+"""Functions to convert SavedModel to frozen GraphDefs."""
 
-Example:
-
-bazel run third_party/tensorflow/contrib/lite/python:convert_saved_model -- \
-  --saved_model_dir=/tmp/test_saved_model/1519865537 \
-  --output_tflite=/tmp/test.lite
-
-"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.lite.python import convert
+from tensorflow.contrib.lite.python import lite_constants
+from tensorflow.contrib.lite.toco import model_flags_pb2
 from tensorflow.contrib.saved_model.python.saved_model import reader
 from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
 from tensorflow.core.framework import types_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import graph_util as tf_graph_util
 from tensorflow.python.framework import ops
-from tensorflow.python.platform import app
-from tensorflow.python.platform import flags
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
 
-flags.DEFINE_string("saved_model_dir", "", "Saved model directory to convert.")
-flags.DEFINE_string("output_tflite", None, "File path to write flatbuffer.")
-flags.DEFINE_string("output_arrays", None,
-                    "List of output tensor names, the default value is None, "
-                    "which means the conversion will keep all outputs.")
-flags.DEFINE_integer("batch_size", 1,
-                     "If input tensor shape has None at first dimension, "
-                     "e.g. (None,224,224,3), replace None with batch_size.")
-flags.DEFINE_string("tag_set", tag_constants.SERVING,
-                    "Group of tag(s) of the MetaGraphDef in the saved_model, "
-                    "in string format, separated by ','. For tag-set contains "
-                    "multiple tags, all tags must be passed in.")
-flags.DEFINE_string("signature_key",
-                    signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
-                    "This is signature key to extract inputs, outputs.")
-
-
-def log_tensor_details(tensor_info):
+
+def _write_and_flush_file(file_path, data_str):
+  """Writes data to file path.
+
+  Args:
+    file_path: Full path of the file to store data in.
+    data_str: Data represented as a string.
+
+  Returns: None.
+  """
+  with gfile.Open(file_path, "wb") as data_file:
+    data_file.write(data_str)
+    data_file.flush()
+
+
+def _log_tensor_details(tensor_info):
   """Log tensor details: name, shape, and type."""
   for key in tensor_info:
     val = tensor_info[key]
@@ -73,7 +64,7 @@ def log_tensor_details(tensor_info):
                  dtype)
 
 
-def get_meta_graph_def(saved_model_dir, tag_set):
+def _get_meta_graph_def(saved_model_dir, tag_set):
   """Validate saved_model and extract MetaGraphDef.
 
   Args:
@@ -103,7 +94,7 @@ def get_meta_graph_def(saved_model_dir, tag_set):
                      "values are '{}'. ".format(tag_set, tag_sets))
 
 
-def get_signature_def(meta_graph, signature_key):
+def _get_signature_def(meta_graph, signature_key):
   """Get the signature def from meta_graph with given signature_key.
 
   Args:
@@ -130,11 +121,11 @@ def get_signature_def(meta_graph, signature_key):
   return signature_def
 
 
-def get_inputs_outputs(signature_def):
-  """Get inputs and outputs from signature def.
+def _get_inputs_outputs(signature_def):
+  """Get inputs and outputs from SignatureDef.
 
   Args:
-    signature_def: signatuer def in the meta_graph_def for conversion.
+    signature_def: SignatureDef in the meta_graph_def for conversion.
 
   Returns:
     The inputs and outputs in the graph for conversion.
@@ -142,9 +133,9 @@ def get_inputs_outputs(signature_def):
   inputs_tensor_info = signature_def.inputs
   outputs_tensor_info = signature_def.outputs
   logging.info("input tensors info: ")
-  log_tensor_details(inputs_tensor_info)
+  _log_tensor_details(inputs_tensor_info)
   logging.info("output tensors info: ")
-  log_tensor_details(outputs_tensor_info)
+  _log_tensor_details(outputs_tensor_info)
 
   def gather_names(tensor_info):
     return [tensor_info[key].name for key in tensor_info]
@@ -154,109 +145,277 @@ def get_inputs_outputs(signature_def):
   return inputs, outputs
 
 
-def convert(saved_model_dir,
-            output_tflite=None,
-            output_arrays=None,
-            tag_set=None,
-            signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
-            batch_size=1):
-  """Convert a saved_model to tflite flatbuffer.
+def _get_tensors(graph, signature_def_tensor_names=None,
+                 user_tensor_names=None):
+  """Gets the tensors associated with the tensor names.
+
+  Either signature_def_tensor_names or user_tensor_names should be provided. If
+  the user provides tensors, the tensors associated with the user provided
+  tensor names are provided. Otherwise, the tensors associated with the names in
+  the SignatureDef are provided.
 
   Args:
-    saved_model_dir: Saved model directory to convert.
-    output_tflite: File path to write result flatbuffer.
-    output_arrays: List of output tensor names, the default value is None, which
-      means conversion keeps all output tensors. This is also used to filter
-      tensors that are from Op currently not supported in tflite, e.g., Argmax).
-    tag_set: This is the set of tags to get meta_graph_def in saved_model.
-    signature_key: This is the signature key to extract inputs, outputs.
-    batch_size: If input tensor shape has None at first dimension,
-      e.g. (None,224,224,3), replace None with batch_size.
+    graph: GraphDef representing graph.
+    signature_def_tensor_names: Tensor names stored in either the inputs or
+      outputs of a SignatureDef. (default None)
+    user_tensor_names: Tensor names provided by the user. (default None)
 
   Returns:
-    The converted data. For example if tflite was the destination, then
-    this will be a tflite flatbuffer in a bytes array.
+    List of tensors.
+
+  Raises:
+    ValueError:
+      signature_def_tensor_names and user_tensor_names are undefined or empty.
+      user_tensor_names are not valid.
+  """
+  tensors = []
+  if user_tensor_names:
+    # Get the list of all of the tensors with and without the tensor index.
+    all_tensor_names = [
+        tensor.name for op in graph.get_operations() for tensor in op.outputs
+    ]
+    all_tensor_names_only = [name.split(":")[0] for name in all_tensor_names]
+
+    # Sort the tensor names.
+    user_tensor_names = sorted(user_tensor_names)
+
+    # Get the tensors associated with the tensor names.
+    tensors = []
+    invalid_tensors = []
+    for name in user_tensor_names:
+      if name not in all_tensor_names_only:
+        invalid_tensors.append(name)
+      else:
+        idx = all_tensor_names_only.index(name)
+        tensors.append(graph.get_tensor_by_name(all_tensor_names[idx]))
+
+    # Throw ValueError if any user input names are not valid tensors.
+    if invalid_tensors:
+      raise ValueError("Invalid tensors '{}' were found.".format(
+          ",".join(invalid_tensors)))
+  elif signature_def_tensor_names:
+    tensors = [
+        graph.get_tensor_by_name(name)
+        for name in sorted(signature_def_tensor_names)
+    ]
+  else:
+    # Throw ValueError if signature_def_tensor_names and user_tensor_names
+    # are both either undefined or empty.
+    raise ValueError(
+        "Specify either signature_def_tensor_names or user_tensor_names")
+
+  return tensors
+
+
+def _freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
+                        output_arrays, tag_set, signature_key, batch_size):
+  """Converts a SavedModel to a frozen graph.
+
+  Args:
+    saved_model_dir: SavedModel directory to convert.
+    input_arrays: List of input tensors to freeze graph with. Uses input arrays
+      from SignatureDef when none are provided. (default None)
+    input_shapes: Map of strings representing input tensor names to list of
+      integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
+      Automatically determined when input shapes is None (e.g., {"foo" : None}).
+      (default None)
+    output_arrays: List of output tensors to freeze graph with. Uses output
+      arrays from SignatureDef when none are provided. (default None)
+    tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
+      analyze. All tags in the tag set must be present. (default "serve")
+    signature_key: Key identifying SignatureDef containing inputs and outputs.
+    batch_size: Batch size for the model. Replaces the first dimension of an
+      input size array if undefined. (default 1)
+
+  Returns:
+    frozen_graph_def: Frozen GraphDef.
+    in_tensors: List of input tensors for the graph.
+    out_tensors: List of output tensors for the graph.
 
   Raises:
-    ValueError: If tag_set does not indicate any meta_graph_def in saved_model,
-      or signature_key is not in relevant meta_graph_def,
-      or input shape has None beyond 1st dimension, e.g., (1,None, None, 3),
-      or given output_arrays are not valid causing empty outputs.
+    ValueError:
+      SavedModel doesn't contain a MetaGraphDef identified by tag_set.
+      signature_key is not in the MetaGraphDef.
+      input_shapes does not match the length of input_arrays.
+      input_shapes has a None value after the 1st dimension.
+      input_arrays or output_arrays are not valid.
+      Unable to load Session.
   """
+  # Set default values for inputs if they are set to None.
+  if signature_key is None:
+    signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
   if tag_set is None:
     tag_set = set([tag_constants.SERVING])
+  if batch_size is None:
+    batch_size = 1
 
-  meta_graph = get_meta_graph_def(saved_model_dir, tag_set)
-  signature_def = get_signature_def(meta_graph, signature_key)
-  inputs, outputs = get_inputs_outputs(signature_def)
+  # Read SignatureDef.
+  meta_graph = _get_meta_graph_def(saved_model_dir, tag_set)
+  signature_def = _get_signature_def(meta_graph, signature_key)
+  inputs, outputs = _get_inputs_outputs(signature_def)
 
   graph = ops.Graph()
   with session.Session(graph=graph) as sess:
-
+    # TODO(nupurgarg): Throw ValueError if SavedModel has assets/ directory.
     loader.load(sess, meta_graph.meta_info_def.tags, saved_model_dir)
 
-    in_tensors = [graph.get_tensor_by_name(input_) for input_ in inputs]
-
-    # Users can use output_arrays to filter output tensors for conversion.
-    # If output_arrays is None, we keep all output tensors. In future, we may
-    # use tflite supported Op list and check whether op is custom Op to
-    # automatically filter output arrays.
-    # TODO(zhixianyan): Use tflite supported Op list to filter outputs.
-    if output_arrays is not None:
-      output_arrays = output_arrays.split(",")
-      out_tensors = [
-          graph.get_tensor_by_name(output)
-          for output in outputs
-          if output.split(":")[0] in output_arrays
-      ]
-    else:
-      out_tensors = [graph.get_tensor_by_name(output) for output in outputs]
+    # Gets input and output tensors.
+    # TODO(zhixianyan): Use TFLite supported Op list to filter outputs.
+    in_tensors = _get_tensors(graph, inputs, input_arrays)
+    out_tensors = _get_tensors(graph, outputs, output_arrays)
 
-    output_names = [node.split(":")[0] for node in outputs]
+    # Gets fully defined tensor shape. An input tensor with None in the first
+    # dimension, e.g. (None, 224, 224, 3), is replaced with the batch_size.
+    # Shapes with None after the first dimension result in a ValueError.
+    # TODO(zhixianyan): Support input tensors with None beyond the first dim.
+    for tensor in in_tensors:
+      if (input_shapes and tensor.name in input_shapes and
+          input_shapes[tensor.name] is not None):
+        shape = input_shapes[tensor.name]
+      else:
+        shape = tensor.get_shape().as_list()
 
-    if not out_tensors:
-      raise ValueError(
-          "No valid output tensors for '{}', possible values are '{}'".format(
-              output_arrays, output_names))
+      if None in shape[1:]:
+        raise ValueError(
+            "None is only supported in the 1st dimension. Tensor '{0}' has "
+            "invalid shape '{1}'.".format(tensor.name, shape))
+      elif shape[0] is None:
+        shape[0] = batch_size
+      tensor.set_shape(shape)
 
+    output_names = [node.split(":")[0] for node in outputs]
     frozen_graph_def = tf_graph_util.convert_variables_to_constants(
         sess, graph.as_graph_def(), output_names)
 
-    # Toco requires fully defined tensor shape, for input tensor with None in
-    # their shape, e.g., (None, 224, 224, 3), we need to replace first None with
-    # a given batch size. For shape with more None, e.g. (None, None, None, 3),
-    # still be able to replace and convert, but require further investigation.
-    # TODO(zhixianyan): Add supports for input tensor with more None in shape.
-    for i in range(len(in_tensors)):
-      shape = in_tensors[i].get_shape().as_list()
-      if shape[0] is None:
-        shape[0] = batch_size
-      if None in shape[1:]:
-        raise ValueError(
-            "Only support None shape at 1st dim as batch_size. But tensor "
-            "'{}' 's shape '{}' has None at other dimension. ".format(
-                inputs[i], shape))
-      in_tensors[i].set_shape(shape)
+    return frozen_graph_def, in_tensors, out_tensors
+  raise ValueError("Unable to load Session.")
 
-    result = lite.toco_convert(frozen_graph_def, in_tensors, out_tensors)
 
-    if output_tflite is not None:
-      with gfile.Open(output_tflite, "wb") as f:
-        f.write(result)
-      logging.info("Successfully converted to: %s", output_tflite)
+def saved_model_to_frozen_graphdef(
+    saved_model_dir,
+    output_file_model,
+    output_file_flags,
+    input_arrays=None,
+    input_shapes=None,
+    output_arrays=None,
+    tag_set=None,
+    signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+    batch_size=1):
+  """Converts a SavedModel to a frozen graph and writes it to file.
 
-    return result
+  Stores the frozen graph and ModelFlags at the given output file paths.
 
+  Args:
+    saved_model_dir: SavedModel directory to convert.
+    output_file_model: Full file path to save frozen graph.
+    output_file_flags: Full file path to save ModelFlags.
+    input_arrays: List of input tensors to freeze graph with. Uses input arrays
+      from SignatureDef when none are provided. (default None)
+    input_shapes: Map of strings representing input tensor names to list of
+      integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
+      Automatically determined when input shapes is None (e.g., {"foo" : None}).
+      (default None)
+    output_arrays: List of output tensors to freeze graph with. Uses output
+      arrays from SignatureDef when none are provided. (default None)
+    tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
+      analyze. All tags in the tag set must be present. (default "serve")
+    signature_key: Key identifying SignatureDef containing inputs and outputs.
+    batch_size: Batch size for the model. Replaces the first dimension of an
+      input size array if undefined. (default 1)
+
+  Returns: None.
 
-def main(_):
-  convert(
-      saved_model_dir=flags.FLAGS.saved_model_dir,
-      output_tflite=flags.FLAGS.output_tflite,
-      output_arrays=flags.FLAGS.output_arrays,
-      batch_size=flags.FLAGS.batch_size,
-      tag_set=set(flags.FLAGS.tag_set.split(",")),
-      signature_key=flags.FLAGS.signature_key)
+  Raises:
+    ValueError: Unable to convert to frozen graph.
+  """
+  frozen_graph_def, in_tensors, out_tensors = _freeze_saved_model(
+      saved_model_dir, input_arrays, input_shapes, output_arrays, tag_set,
+      signature_key, batch_size)
+
+  # Initialize model flags.
+  model = model_flags_pb2.ModelFlags()
+
+  for input_tensor in in_tensors:
+    input_array = model.input_arrays.add()
+    input_array.name = convert.tensor_name(input_tensor)
+    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
+
+  for output_tensor in out_tensors:
+    model.output_arrays.append(convert.tensor_name(output_tensor))
+
+  # Write model and ModelFlags to file. ModelFlags contain input array and
+  # output array information that is parsed from the SignatureDef and used for
+  # analysis by TOCO.
+  _write_and_flush_file(output_file_model, frozen_graph_def.SerializeToString())
+  _write_and_flush_file(output_file_flags, model.SerializeToString())
+
+
+def tflite_from_saved_model(
+    saved_model_dir,
+    output_file=None,
+    input_arrays=None,
+    input_shapes=None,
+    output_arrays=None,
+    tag_set=None,
+    signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+    batch_size=1,
+    inference_type=lite_constants.FLOAT,
+    input_format=lite_constants.TENSORFLOW_GRAPHDEF,
+    output_format=lite_constants.TFLITE,
+    quantized_input_stats=None,
+    drop_control_dependency=True):
+  """Converts a SavedModel to TFLite FlatBuffer.
 
+  Args:
+    saved_model_dir: SavedModel directory to convert.
+    output_file: File path to write result TFLite FlatBuffer.
+    input_arrays: List of input tensors to freeze graph with. Uses input arrays
+      from SignatureDef when none are provided. (default None)
+    input_shapes: Map of strings representing input tensor names to list of
+      integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
+      Automatically determined when input shapes is None (e.g., {"foo" : None}).
+      (default None)
+    output_arrays: List of output tensors to freeze graph with. Uses output
+      arrays from SignatureDef when none are provided. (default None)
+    tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
+      analyze. All tags in the tag set must be present. (default "serve")
+    signature_key: Key identifying SignatureDef containing inputs and outputs.
+    batch_size: Batch size for the model. Replaces the first dimension of an
+      input size array if undefined. (default 1)
+    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
+    input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF).
+    output_format: Type of data to write (currently must be TFLITE or
+      GRAPHVIZ_DOT)
+    quantized_input_stats: For each member of input_tensors the mean and
+      std deviation of training data. Only needed if `inference_type` is
+      `QUANTIZED_UINT8`.
+    drop_control_dependency: Drops control dependencies silently. This is due
+      to tf lite not supporting control dependencies.
 
-if __name__ == "__main__":
-  app.run(main)
+  Returns:
+    The converted data. For example if tflite was the destination, then
+    this will be a tflite flatbuffer in a bytes array.
+
+  Raises:
+    ValueError: Unable to convert to frozen graph.
+  """
+  frozen_graph_def, in_tensors, out_tensors = _freeze_saved_model(
+      saved_model_dir, input_arrays, input_shapes, output_arrays, tag_set,
+      signature_key, batch_size)
+
+  result = convert.toco_convert(
+      input_data=frozen_graph_def,
+      input_tensors=in_tensors,
+      output_tensors=out_tensors,
+      inference_type=inference_type,
+      input_format=input_format,
+      output_format=output_format,
+      quantized_input_stats=quantized_input_stats,
+      drop_control_dependency=drop_control_dependency)
+
+  if output_file is not None:
+    with gfile.Open(output_file, "wb") as f:
+      f.write(result)
+    logging.info("Successfully converted to: %s", output_file)
+
+  return result
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
index 734e42d619..db95fc8ad7 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model_test.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TF Lite SavedModel Conversion test cases.
-
- - test on generated saved_models from simple graphs (sanity check)
- - test mnist savedmodel generated on-the-fly
+"""TFLite SavedModel conversion test cases.
 
+  - Tests converting simple SavedModel graph to TFLite FlatBuffer.
+  - Tests converting simple SavedModel graph to frozen graph.
+  - Tests converting MNIST SavedModel to TFLite FlatBuffer.
 """
 
 from __future__ import absolute_import
@@ -25,6 +25,7 @@ from __future__ import print_function
 
 import os
 from tensorflow.contrib.lite.python import convert_saved_model
+from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
 from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator_lib as estimator
@@ -37,6 +38,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import saved_model
 from tensorflow.python.training import training as train
@@ -45,7 +47,7 @@ from tensorflow.python.training import training as train
 class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase):
 
   def _createSimpleSavedModel(self, shape):
-    """Create a simple savedmodel on the fly."""
+    """Create a simple SavedModel on the fly."""
     saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel")
     with session.Session() as sess:
       in_tensor = array_ops.placeholder(shape=shape, dtype=dtypes.float32)
@@ -56,44 +58,78 @@ class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase):
     return saved_model_dir
 
   def testSimpleSavedModel(self):
-    """Test a simple savedmodel created on the fly."""
-    # Create a simple savedmodel
+    """Test a simple SavedModel created on the fly."""
+    # Create a simple SavedModel
     saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
     # Convert to tflite
-    result = convert_saved_model.convert(saved_model_dir=saved_model_dir)
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir)
     self.assertTrue(result)
 
   def testSimpleSavedModelWithNoneBatchSizeInShape(self):
-    """Test a simple savedmodel, with None in input tensor's shape."""
+    """Test a simple SavedModel, with None in input tensor's shape."""
     saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3])
-    result = convert_saved_model.convert(saved_model_dir=saved_model_dir)
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir)
     self.assertTrue(result)
 
   def testSimpleSavedModelWithMoreNoneInShape(self):
-    """Test a simple savedmodel, fail as more None in input shape."""
+    """Test a simple SavedModel, fail as more None in input shape."""
     saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, None, 3])
     # Convert to tflite: this should raise ValueError, as 3rd dim is None.
     with self.assertRaises(ValueError):
-      convert_saved_model.convert(saved_model_dir=saved_model_dir)
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir)
 
   def testSimpleSavedModelWithWrongSignatureKey(self):
-    """Test a simple savedmodel, fail as given signature is invalid."""
+    """Test a simple SavedModel, fail as given signature is invalid."""
     saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
     # Convert to tflite: this should raise ValueError, as
     # signature_key does not exit in the saved_model.
     with self.assertRaises(ValueError):
-      convert_saved_model.convert(
+      convert_saved_model.tflite_from_saved_model(
           saved_model_dir=saved_model_dir, signature_key="wrong-key")
 
   def testSimpleSavedModelWithWrongOutputArray(self):
-    """Test a simple savedmodel, fail as given output_arrays is invalid."""
-    # Create a simple savedmodel
+    """Test a simple SavedModel, fail as given output_arrays is invalid."""
+    # Create a simple SavedModel
     saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
     # Convert to tflite: this should raise ValueError, as
     # output_arrays is not valid for the saved_model.
     with self.assertRaises(ValueError):
-      convert_saved_model.convert(
-          saved_model_dir=saved_model_dir, output_arrays="wrong-output")
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir, output_arrays=["wrong-output"])
+
+  def testSimpleSavedModelWithWrongInputArrays(self):
+    """Test a simple SavedModel, fail as given input_arrays is invalid."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    # Checks invalid input_arrays.
+    with self.assertRaises(ValueError):
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir, input_arrays=["wrong-input"])
+    # Checks valid and invalid input_arrays.
+    with self.assertRaises(ValueError):
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir,
+          input_arrays=["Placeholder", "wrong-input"])
+
+  def testSimpleSavedModelWithCorrectArrays(self):
+    """Test a simple SavedModel, with correct input_arrays and output_arrays."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3])
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir,
+        input_arrays=["Placeholder"],
+        output_arrays=["add"])
+    self.assertTrue(result)
+
+  def testSimpleSavedModelWithCorrectInputArrays(self):
+    """Test a simple SavedModel, with correct input_arrays and input_shapes."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir,
+        input_arrays=["Placeholder"],
+        input_shapes={"Placeholder": [1, 16, 16, 3]})
+    self.assertTrue(result)
 
   def testMultipleMetaGraphDef(self):
     """Test saved model with multiple MetaGraphDef."""
@@ -119,20 +155,103 @@ class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase):
           sess,
           tags=[saved_model.tag_constants.SERVING, "additional_test_tag"],
           signature_def_map=signature_def_map)
+
       # MetaGraphDef 2
       builder.add_meta_graph(tags=["tflite"])
       builder.save(True)
 
     # Convert to tflite
-    convert_saved_model.convert(
+    convert_saved_model.tflite_from_saved_model(
         saved_model_dir=saved_model_dir,
         tag_set=set([saved_model.tag_constants.SERVING, "additional_test_tag"]))
 
 
+class ConvertSavedModelTestBasicGraphToText(test_util.TensorFlowTestCase):
+
+  def _createSimpleSavedModel(self, shape):
+    """Create a simple SavedModel."""
+    saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel")
+    with session.Session() as sess:
+      in_tensor_1 = array_ops.placeholder(
+          shape=shape, dtype=dtypes.float32, name="inputB")
+      in_tensor_2 = array_ops.placeholder(
+          shape=shape, dtype=dtypes.float32, name="inputA")
+      out_tensor = in_tensor_1 + in_tensor_2
+      inputs = {"x": in_tensor_1, "y": in_tensor_2}
+      outputs = {"z": out_tensor}
+      saved_model.simple_save(sess, saved_model_dir, inputs, outputs)
+    return saved_model_dir
+
+  def _getInputArrayNames(self, model_proto):
+    return [data.name for data in model_proto.input_arrays]
+
+  def _getInputArrayShapes(self, model_proto):
+    return [
+        [dim for dim in data.shape.dims] for data in model_proto.input_arrays
+    ]
+
+  def _get_model_flags_proto_from_file(self, filename):
+    proto = _model_flags_pb2.ModelFlags()
+    with gfile.Open(filename, "rb") as output_file:
+      proto.ParseFromString(output_file.read())
+      output_file.close()
+    return proto
+
+  def testSimpleSavedModel(self):
+    """Test a simple SavedModel."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    output_file_model = os.path.join(self.get_temp_dir(), "model.pb")
+    output_file_flags = os.path.join(self.get_temp_dir(), "model.pbtxt")
+
+    convert_saved_model.saved_model_to_frozen_graphdef(
+        saved_model_dir=saved_model_dir,
+        output_file_model=output_file_model,
+        output_file_flags=output_file_flags,
+        input_arrays=["inputB", "inputA"])
+
+    proto = self._get_model_flags_proto_from_file(output_file_flags)
+    self.assertEqual(proto.output_arrays, ["add"])
+    self.assertEqual(self._getInputArrayNames(proto), ["inputA", "inputB"])
+    self.assertEqual(
+        self._getInputArrayShapes(proto), [[1, 16, 16, 3], [1, 16, 16, 3]])
+
+  def testSimpleSavedModelWithDifferentInputNames(self):
+    """Test a simple SavedModel."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    output_file_model = os.path.join(self.get_temp_dir(), "model.pb")
+    output_file_flags = os.path.join(self.get_temp_dir(), "model.pbtxt")
+
+    # Check case where input shape is given.
+    convert_saved_model.saved_model_to_frozen_graphdef(
+        saved_model_dir=saved_model_dir,
+        output_file_model=output_file_model,
+        output_file_flags=output_file_flags,
+        input_arrays=["inputA"],
+        input_shapes={"inputA": [1, 16, 16, 3]})
+
+    proto = self._get_model_flags_proto_from_file(output_file_flags)
+    self.assertEqual(proto.output_arrays, ["add"])
+    self.assertEqual(self._getInputArrayNames(proto), ["inputA"])
+    self.assertEqual(self._getInputArrayShapes(proto), [[1, 16, 16, 3]])
+
+    # Check case where input shape is None.
+    convert_saved_model.saved_model_to_frozen_graphdef(
+        saved_model_dir=saved_model_dir,
+        output_file_model=output_file_model,
+        output_file_flags=output_file_flags,
+        input_arrays=["inputA"],
+        input_shapes={"inputA": None})
+
+    proto = self._get_model_flags_proto_from_file(output_file_flags)
+    self.assertEqual(proto.output_arrays, ["add"])
+    self.assertEqual(self._getInputArrayNames(proto), ["inputA"])
+    self.assertEqual(self._getInputArrayShapes(proto), [[1, 16, 16, 3]])
+
+
 class Model(keras.Model):
   """Model to recognize digits in the MNIST dataset.
 
-  Train and export savedmodel, used for testOnflyTrainMnistSavedModel
+  Train and export SavedModel, used for testOnflyTrainMnistSavedModel
 
   Network structure is equivalent to:
   https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
@@ -238,7 +357,7 @@ def dummy_input_fn():
 class ConvertSavedModelTestTrainGraph(test_util.TensorFlowTestCase):
 
   def testTrainedMnistSavedModel(self):
-    """Test mnist savedmodel, trained with dummy data and small steps."""
+    """Test mnist SavedModel, trained with dummy data and small steps."""
     # Build classifier
     classifier = estimator.Estimator(
         model_fn=model_fn,
@@ -253,21 +372,20 @@ class ConvertSavedModelTestTrainGraph(test_util.TensorFlowTestCase):
         "image": image,
     })
 
-    # Export savedmodel
+    # Export SavedModel
     saved_model_dir = os.path.join(self.get_temp_dir(), "mnist_savedmodel")
     classifier.export_savedmodel(saved_model_dir, pred_input_fn)
 
     # Convert to tflite and test output
     saved_model_name = os.listdir(saved_model_dir)[0]
     saved_model_final_dir = os.path.join(saved_model_dir, saved_model_name)
-    output_tflite = os.path.join(saved_model_dir,
-                                 saved_model_final_dir + ".lite")
+    output_file = os.path.join(saved_model_dir, saved_model_final_dir + ".lite")
     # TODO(zhixianyan): no need to limit output_arrays to `Softmax'
     # once b/74205001 fixed and argmax implemented in tflite.
-    result = convert_saved_model.convert(
+    result = convert_saved_model.tflite_from_saved_model(
         saved_model_dir=saved_model_final_dir,
-        output_arrays="Softmax",
-        output_tflite=output_tflite)
+        output_arrays=["Softmax"],
+        output_file=output_file)
 
     self.assertTrue(result)
 
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py b/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py
new file mode 100644
index 0000000000..4d9782f4a6
--- /dev/null
+++ b/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py
@@ -0,0 +1,106 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python console command for generating frozen models from SavedModels.
+
+This exists to add SavedModel compatibility to TOCO.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+from tensorflow.contrib.lite.python.convert_saved_model import saved_model_to_frozen_graphdef
+from tensorflow.python.platform import app
+
+FLAGS = None
+
+
+def execute(unused_args):
+  """Converts the SavedModel named by FLAGS to a frozen graph."""
+  if FLAGS.input_shapes and not FLAGS.input_arrays:
+    raise ValueError("Input shapes requires input arrays to be specified.")
+
+  # Collect only the optional flags that were set. argparse leaves unset flags
+  # as None, which must not override saved_model_to_frozen_graphdef's defaults.
+  kwargs = {
+      "signature_key": FLAGS.signature_key,
+      "batch_size": FLAGS.batch_size,
+  }
+  if FLAGS.input_arrays:
+    kwargs["input_arrays"] = FLAGS.input_arrays.split(",")
+  if FLAGS.input_shapes:
+    kwargs["input_shapes"] = {
+        kwargs["input_arrays"][idx]: [int(dim) for dim in shape.split(",")]
+        for idx, shape in enumerate(FLAGS.input_shapes.split(":"))
+    }
+  if FLAGS.output_arrays:
+    kwargs["output_arrays"] = FLAGS.output_arrays.split(",")
+  if FLAGS.tag_set:
+    kwargs["tag_set"] = set(FLAGS.tag_set.split(","))
+
+  saved_model_to_frozen_graphdef(
+      saved_model_dir=FLAGS.saved_model_directory,
+      output_file_model=FLAGS.output_file_model,
+      output_file_flags=FLAGS.output_file_flags,
+      **{key: value for key, value in kwargs.items() if value is not None})
+
+
+def main():
+  """Parses the command line into FLAGS and runs `execute` through app.run."""
+  global FLAGS
+  parser = argparse.ArgumentParser(
+      description="Invoke SavedModel to frozen model converter.")
+  parser.add_argument(
+      "saved_model_directory",
+      type=str,
+      help="Full path to directory containing the SavedModel.")
+  parser.add_argument(
+      "output_file_model",
+      type=str,
+      help="Full file path to save frozen graph.")
+  parser.add_argument(
+      "output_file_flags", type=str, help="Full file path to save ModelFlags.")
+  parser.add_argument(
+      "--input_arrays",
+      type=str,
+      help="Name of the input arrays, comma-separated.")
+  parser.add_argument(
+      "--input_shapes",
+      type=str,
+      help="Shapes corresponding to --input_arrays, colon-separated.")
+  parser.add_argument(
+      "--output_arrays",
+      type=str,
+      help="Name of the output arrays, comma-separated.")
+  parser.add_argument(
+      "--tag_set", type=str, help="Tags of the MetaGraphDef, comma-separated.")
+  parser.add_argument(
+      "--signature_key",
+      type=str,
+      help="Key identifying SignatureDef containing inputs and outputs.")
+  parser.add_argument(
+      "--batch_size",
+      type=int,
+      help="Batch size for the model. Replaces the first dimension of an "
+      "input size array if undefined.")
+
+  FLAGS, unparsed = parser.parse_known_args()
+  # Arguments argparse did not recognize are handed on to app.run.
+  app.run(main=execute, argv=[sys.argv[0]] + unparsed)
+
+
+if __name__ == "__main__":
+  main()
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/convert_test.py
similarity index 82%
rename from tensorflow/contrib/lite/python/lite_test.py
rename to tensorflow/contrib/lite/python/convert_test.py
index b8b4510188..dc21a9b669 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/convert_test.py
@@ -17,8 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.lite.python import lite
-from tensorflow.contrib.lite.python.op_hint import _tensor_name_base as _tensor_name_base
+from tensorflow.contrib.lite.python import convert
+from tensorflow.contrib.lite.python import lite_constants
+from tensorflow.contrib.lite.python import op_hint
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -29,7 +30,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class LiteTest(test_util.TensorFlowTestCase):
+class ConvertTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
     in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3],
@@ -37,13 +38,13 @@ class LiteTest(test_util.TensorFlowTestCase):
     out_tensor = in_tensor + in_tensor
     sess = session.Session()
     # Try running on valid graph
-    result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor])
+    result = convert.toco_convert(sess.graph_def, [in_tensor], [out_tensor])
     self.assertTrue(result)
     # TODO(aselle): remove tests that fail (we must get TOCO to not fatal
     # all the time).
     # Try running on identity graph (known fail)
     # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"):
-    #   result = lite.toco_convert(sess.graph_def, [in_tensor], [in_tensor])
+    #   result = convert.toco_convert(sess.graph_def, [in_tensor], [in_tensor])
 
   def testQuantization(self):
     in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3],
@@ -51,13 +52,14 @@ class LiteTest(test_util.TensorFlowTestCase):
     out_tensor = array_ops.fake_quant_with_min_max_args(in_tensor + in_tensor,
                                                         min=0., max=1.)
     sess = session.Session()
-    result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor],
-                               inference_type=lite.QUANTIZED_UINT8,
-                               quantized_input_stats=[(0., 1.)])
+    result = convert.toco_convert(
+        sess.graph_def, [in_tensor], [out_tensor],
+        inference_type=lite_constants.QUANTIZED_UINT8,
+        quantized_input_stats=[(0., 1.)])
     self.assertTrue(result)
 
 
-class LiteTestOpHint(test_util.TensorFlowTestCase):
+class ConvertTestOpHint(test_util.TensorFlowTestCase):
   """Test the hint to stub functionality."""
 
   def _getGraphOpTypes(self, graphdef, output_nodes):
@@ -99,7 +101,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
     swish_scale = array_ops.constant(1.0)
 
     def _swish(input_tensor, scale):
-      custom = lite.OpHint("cool_activation")
+      custom = op_hint.OpHint("cool_activation")
       input_tensor, scale = custom.add_inputs(input_tensor, scale)
       output = math_ops.sigmoid(input_tensor) * input_tensor * scale
       output, = custom.add_outputs(output)
@@ -111,11 +113,12 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
       # and 1 final output).
       self.assertEqual(self._countIdentities(sess.graph_def.node), 4)
 
-      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
 
       self.assertCountEqual(
           self._getGraphOpTypes(
-              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+              stubbed_graphdef,
+              output_nodes=[op_hint._tensor_name_base(output)]),
           ["cool_activation", "Const", "Identity"])
 
   def testScaleAndBiasAndIdentity(self):
@@ -125,7 +128,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
     b = array_ops.constant([4., 5.])
 
     def _scaled_and_bias_and_identity(a, x, b):
-      custom = lite.OpHint("scale_and_bias_and_identity")
+      custom = op_hint.OpHint("scale_and_bias_and_identity")
       a, x, b = custom.add_inputs(a, x, b)
       return custom.add_outputs(a * x + b, x)
     output = array_ops.identity(_scaled_and_bias_and_identity(a, x, b),
@@ -136,11 +139,12 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
       # +1 for the final output
       self.assertEqual(self._countIdentities(sess.graph_def.node), 6)
 
-      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
 
       self.assertCountEqual(
           self._getGraphOpTypes(
-              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+              stubbed_graphdef,
+              output_nodes=[op_hint._tensor_name_base(output)]),
           ["scale_and_bias_and_identity", "Const", "Identity", "Pack"])
 
   def testTwoFunctions(self):
@@ -148,7 +152,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
     a = array_ops.constant([1.])
     b = array_ops.constant([1.])
     def _double_values(x):
-      custom = lite.OpHint("add_test")
+      custom = op_hint.OpHint("add_test")
       x = custom.add_inputs(x)
       output = math_ops.multiply(x, x)
       output, = custom.add_outputs(output)
@@ -160,10 +164,11 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
       # make sure one identity for each input (2) and output (2) => 2 + 2
       # +1 for the final output
       self.assertEqual(self._countIdentities(sess.graph_def.node), 5)
-      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
       self.assertCountEqual(
           self._getGraphOpTypes(
-              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+              stubbed_graphdef,
+              output_nodes=[op_hint._tensor_name_base(output)]),
           ["add_test", "Const", "Identity", "Add"])
 
 
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index cf50f9d4d6..4ea40201f7 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -18,6 +18,7 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 
 @@toco_convert
 @@toco_convert_protos
+@@tflite_from_saved_model
 @@OpHint
 @@convert_op_hints_to_stubs
 
@@ -25,208 +26,11 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import os as _os
-import subprocess as _subprocess
-import tempfile as _tempfile
 
 # pylint: disable=unused-import
+from tensorflow.contrib.lite.python.convert import toco_convert
+from tensorflow.contrib.lite.python.convert import toco_convert_protos
+from tensorflow.contrib.lite.python.convert_saved_model import tflite_from_saved_model
 from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.contrib.lite.python.op_hint import OpHint
 # pylint: enable=unused-import
-from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
-from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
-from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
-from tensorflow.python.framework import dtypes as _dtypes
-from tensorflow.python.platform import resource_loader as _resource_loader
-from tensorflow.python.util.all_util import remove_undocumented
-from tensorflow.python.util.lazy_loader import LazyLoader
-
-# Lazy load since some of the performance benchmark skylark rules
-# break dependencies.
-_toco_python = LazyLoader(
-    "tensorflow_wrap_toco", globals(),
-    "tensorflow.contrib.lite.toco.python."
-    "tensorflow_wrap_toco")
-del LazyLoader
-
-# Enum types from the protobuf promoted to the API
-FLOAT = _types_pb2.FLOAT
-INT32 = _types_pb2.INT32
-INT64 = _types_pb2.INT64
-STRING = _types_pb2.STRING
-QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8
-TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF
-TFLITE = _toco_flags_pb2.TFLITE
-GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
-
-# Currently the default mode of operation is to shell to another python process
-# to protect against crashes. However, it breaks some dependent targets because
-# it forces us to depend on an external py_binary. The experimental API doesn't
-# have that drawback.
-EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False
-
-# Find the toco_from_protos binary using the resource loader if using from
-# bazel, otherwise we are in a pip where console_scripts already has
-# the toco_from_protos tool.
-if EXPERIMENTAL_USE_TOCO_API_DIRECTLY:
-  _toco_from_proto_bin = ""
-else:
-  _toco_from_proto_bin = _resource_loader.get_path_to_datafile(
-      "../toco/python/toco_from_protos")
-
-if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin):
-  _toco_from_proto_bin = "toco_from_protos"
-
-
-def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
-  """Convert `input_data_str` according to model and toco parameters.
-
-  Unless you know what you are doing consider using
-  the more friendly @{tf.contrib.lite.toco_convert}}.
-
-  Args:
-    model_flags_str: Serialized proto describing model properties, see
-      `toco/model_flags.proto`.
-    toco_flags_str: Serialized proto describing conversion properties, see
-      `toco/toco_flags.proto`.
-    input_data_str: Input data in serialized form (e.g. a graphdef is common)
-  Returns:
-    Converted model in serialized form (e.g. a TFLITE model is common).
-  Raises:
-    RuntimeError: When conversion fails, an exception is raised with the error
-      message embedded.
-  """
-  # TODO(aselle): When toco does not use fatal errors for failure, we can
-  # switch this on.
-  if not _toco_from_proto_bin:
-    return _toco_python.TocoConvert(
-        model_flags_str, toco_flags_str, input_data_str)
-
-  with _tempfile.NamedTemporaryFile() as fp_toco, \
-           _tempfile.NamedTemporaryFile() as fp_model, \
-           _tempfile.NamedTemporaryFile() as fp_input, \
-           _tempfile.NamedTemporaryFile() as fp_output:
-    fp_model.write(model_flags_str)
-    fp_toco.write(toco_flags_str)
-    fp_input.write(input_data_str)
-    fp_model.flush()
-    fp_toco.flush()
-    fp_input.flush()
-
-    cmd = [
-        _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name,
-        fp_output.name
-    ]
-    cmdline = " ".join(cmd)
-    proc = _subprocess.Popen(
-        cmdline,
-        shell=True,
-        stdout=_subprocess.PIPE,
-        stderr=_subprocess.STDOUT,
-        close_fds=True)
-    stdout, stderr = proc.communicate()
-    exitcode = proc.returncode
-    if exitcode == 0:
-      stuff = fp_output.read()
-      return stuff
-    else:
-      raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" %
-                         (stdout, stderr))
-
-
-def _tensor_name(x):
-  return x.name.split(":")[0]
-
-
-def toco_convert(input_data,
-                 input_tensors,
-                 output_tensors,
-                 inference_type=FLOAT,
-                 input_format=TENSORFLOW_GRAPHDEF,
-                 output_format=TFLITE,
-                 quantized_input_stats=None,
-                 drop_control_dependency=True,
-                 allow_custom_ops=None):
-  """Convert a model using TOCO from `input_format` to `output_format`.
-
-  Typically this is to convert from TensorFlow GraphDef to TFLite, in which
-  case the default `input_format` and `output_format` are sufficient.
-
-  Args:
-    input_data: Input data (i.e. often `sess.graph_def`).
-    input_tensors: List of input tensors. Type and shape are computed using
-      `foo.get_shape()` and `foo.dtype`.
-    output_tensors: List of output tensors (only .name is used from this).
-    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
-    input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF).
-    output_format: Type of data to write (currently must be TFLITE or
-      GRAPHVIZ_DOT)
-    quantized_input_stats: For each member of input_tensors the mean and
-      std deviation of training data. Only needed if `inference_type` is
-      `QUANTIZED_UINT8`.
-    drop_control_dependency: Drops control dependencies silently. This is due
-      to tf lite not supporting control dependencies.
-
-  Returns:
-    The converted data. For example if tflite was the destination, then
-    this will be a tflite flatbuffer in a bytes array.
-
-  Raises:
-    ValueError: If the input tensor type is unknown
-    RuntimeError: If TOCO fails to convert (in which case the runtime error's
-      error text will contain the TOCO error log)
-  """
-  toco = _toco_flags_pb2.TocoFlags()
-  toco.input_format = input_format
-  toco.output_format = output_format
-  toco.inference_type = inference_type
-  toco.drop_control_dependency = drop_control_dependency
-  if allow_custom_ops is not None:
-    toco.allow_custom_ops = allow_custom_ops
-
-  model = _model_flags_pb2.ModelFlags()
-  for idx, input_tensor in enumerate(input_tensors):
-    if input_tensor.dtype == _dtypes.float32:
-      tflite_input_type = FLOAT
-    elif input_tensor.dtype == _dtypes.int32:
-      tflite_input_type = INT32
-    elif input_tensor.dtype == _dtypes.int64:
-      tflite_input_type = INT64
-    # TODO(aselle): Insert strings when they are available
-    else:
-      raise ValueError("Tensors %s not known type %r" % (input_tensor.name,
-                                                         input_tensor.dtype))
-
-    input_array = model.input_arrays.add()
-
-    if inference_type == QUANTIZED_UINT8:
-      if tflite_input_type == FLOAT:
-        tflite_input_type = QUANTIZED_UINT8
-      input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
-
-    input_array.name = _tensor_name(input_tensor)
-    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
-
-  for output_tensor in output_tensors:
-    model.output_arrays.append(_tensor_name(output_tensor))
-
-  # TODO(aselle): Consider handling the case of allowing quantized
-  # inputs to be converted to float (via the toco.inference_input_type field).
-  data = toco_convert_protos(model.SerializeToString(),
-                             toco.SerializeToString(),
-                             input_data.SerializeToString())
-  return data
-
-
-_allowed_symbols = [
-    "FLOAT",
-    "INT32",
-    "INT64",
-    "STRING",
-    "QUANTIZED_UINT8",
-    "TENSORFLOW_GRAPHDEF",
-    "TFLITE",
-    "GRAPHVIZ_DOT",
-    "EXPERIMENTAL_USE_TOCO_API_DIRECTLY",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/lite/python/lite_constants.py b/tensorflow/contrib/lite/python/lite_constants.py
new file mode 100644
index 0000000000..195d7a732f
--- /dev/null
+++ b/tensorflow/contrib/lite/python/lite_constants.py
@@ -0,0 +1,53 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Constants for TFLite."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
+from tensorflow.python.util.all_util import remove_undocumented
+
+# Enum types from the protobuf promoted to the API
+FLOAT = _types_pb2.FLOAT
+INT32 = _types_pb2.INT32
+INT64 = _types_pb2.INT64
+STRING = _types_pb2.STRING
+QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8
+TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF
+TFLITE = _toco_flags_pb2.TFLITE
+GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
+
+# Currently the default mode of operation is to shell to another python process
+# to protect against crashes. However, it breaks some dependent targets because
+# it forces us to depend on an external py_binary. The experimental API doesn't
+# have that drawback.
+EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False
+
+
+_allowed_symbols = [
+    "FLOAT",
+    "INT32",
+    "INT64",
+    "STRING",
+    "QUANTIZED_UINT8",
+    "TENSORFLOW_GRAPHDEF",
+    "TFLITE",
+    "GRAPHVIZ_DOT",
+    "EXPERIMENTAL_USE_TOCO_API_DIRECTLY",
+]
+remove_undocumented(__name__, _allowed_symbols)
-- 
GitLab


From ecd837fd0ab69cf54d920eae3b1c73602be6c626 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 17:14:16 -0700
Subject: [PATCH 337/450] [TF:XLA] Add a kernel for PlaceholderWithDefault

PiperOrigin-RevId: 194010395
---
 tensorflow/compiler/tests/BUILD               | 12 +++++
 tensorflow/compiler/tests/placeholder_test.py | 48 +++++++++++++++++++
 .../compiler/tf2xla/kernels/identity_op.cc    |  1 +
 3 files changed, 61 insertions(+)
 create mode 100644 tensorflow/compiler/tests/placeholder_test.py

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index ac2441cea0..0c72093256 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -923,3 +923,15 @@ tf_xla_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
+
+tf_xla_py_test(
+    name = "placeholder_test",
+    size = "small",
+    srcs = ["placeholder_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
diff --git a/tensorflow/compiler/tests/placeholder_test.py b/tensorflow/compiler/tests/placeholder_test.py
new file mode 100644
index 0000000000..5e6d1313bd
--- /dev/null
+++ b/tensorflow/compiler/tests/placeholder_test.py
@@ -0,0 +1,48 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for xla handling of placeholder_with_default."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+class PlaceholderTest(XLATestCase):
+
+  def test_placeholder_with_default_default(self):
+    with self.test_session() as sess, self.test_scope():
+      v = resource_variable_ops.ResourceVariable(4.0)
+      ph = array_ops.placeholder_with_default(v, shape=[])
+      out = ph * 2
+      sess.run(variables.variables_initializer([v]))
+      self.assertEqual(8.0, sess.run(out))
+
+  def test_placeholder_with_default_fed(self):
+    with self.test_session() as sess, self.test_scope():
+      v = resource_variable_ops.ResourceVariable(4.0)
+      ph = array_ops.placeholder_with_default(v, shape=[])
+      out = ph * 2
+      sess.run(variables.variables_initializer([v]))
+      self.assertEqual(2.0, sess.run(out, {ph: 1.0}))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
index 39af662b63..e72200bfbc 100644
--- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
@@ -38,6 +38,7 @@ class IdentityOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("Identity").CompilationOnly(), IdentityOp);
 
 REGISTER_XLA_OP(Name("IdentityN").CompilationOnly(), IdentityOp);
+REGISTER_XLA_OP(Name("PlaceholderWithDefault"), IdentityOp);
 REGISTER_XLA_OP(Name("PreventGradient"), IdentityOp);
 REGISTER_XLA_OP(Name("StopGradient"), IdentityOp);
 REGISTER_XLA_OP(Name("Snapshot"), IdentityOp);
-- 
GitLab


From 80fc661853f9a0844faf95eb68438dc85a5879e3 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Mon, 23 Apr 2018 17:16:55 -0700
Subject: [PATCH 338/450] Use tensorflow::se instead of perftools::gputools for
 StreamExecutor.

PiperOrigin-RevId: 194010749
---
 tensorflow/compiler/aot/compile.cc            |  5 +-
 .../compiler/jit/kernels/xla_launch_op.cc     | 12 ++--
 .../compiler/jit/kernels/xla_launch_op.h      |  2 +-
 .../compiler/jit/xla_compile_on_demand_op.cc  |  2 +-
 tensorflow/compiler/jit/xla_device.cc         |  2 -
 tensorflow/compiler/jit/xla_device.h          | 13 ++--
 tensorflow/compiler/jit/xla_device_context.cc |  2 -
 tensorflow/compiler/jit/xla_device_context.h  | 15 ++---
 tensorflow/compiler/jit/xla_launch_util.cc    | 26 ++++----
 tensorflow/compiler/jit/xla_launch_util.h     | 13 ++--
 tensorflow/compiler/jit/xla_tensor.cc         |  9 ++-
 tensorflow/compiler/jit/xla_tensor.h          |  3 +-
 .../fused_conv2d_bias_activation_op.cc        |  2 +-
 .../kernels/adjust_hsv_in_yiq_op_gpu.cu.cc    |  2 +-
 .../mpi_collectives/kernels/mpi_ops.cc        |  2 +-
 tensorflow/contrib/mpi_collectives/mpi_ops.cc |  2 +-
 .../contrib/nccl/kernels/nccl_manager.cc      | 56 ++++++++---------
 .../contrib/nccl/kernels/nccl_manager.h       | 36 +++++------
 .../contrib/nccl/kernels/nccl_manager_test.cc |  8 +--
 tensorflow/contrib/rnn/kernels/blas_gemm.cc   | 11 ++--
 .../contrib/tensorrt/kernels/trt_engine_op.cc |  1 -
 .../common_runtime/gpu/gpu_bfc_allocator.h    |  8 +--
 .../gpu/gpu_cudamalloc_allocator.h            |  2 +-
 .../common_runtime/gpu/gpu_debug_allocator.cc |  6 +-
 .../common_runtime/gpu/gpu_debug_allocator.h  |  4 +-
 .../core/common_runtime/gpu/gpu_device.cc     |  5 +-
 .../core/common_runtime/gpu/gpu_event_mgr.cc  | 22 +++----
 .../core/common_runtime/gpu/gpu_event_mgr.h   | 30 ++++-----
 .../common_runtime/gpu/gpu_event_mgr_test.cc  | 19 +++---
 .../core/common_runtime/gpu/gpu_init.cc       |  8 +--
 .../core/common_runtime/gpu/gpu_util.cc       | 20 +++---
 tensorflow/core/common_runtime/gpu/gpu_util.h |  5 +-
 .../core/common_runtime/gpu/pool_allocator.h  |  4 +-
 .../common_runtime/gpu/pool_allocator_test.cc | 32 +++++-----
 .../core/common_runtime/gpu_device_context.h  |  4 +-
 tensorflow/core/grappler/devices.cc           | 12 ++--
 tensorflow/core/kernels/avgpooling_op.cc      | 24 +++----
 .../core/kernels/batch_matmul_op_impl.h       | 44 ++++++-------
 tensorflow/core/kernels/bias_op.cc            |  4 +-
 tensorflow/core/kernels/check_numerics_op.cc  |  6 +-
 .../core/kernels/conv_grad_filter_ops.cc      | 32 +++++-----
 .../core/kernels/conv_grad_input_ops.cc       | 28 ++++-----
 tensorflow/core/kernels/conv_grad_ops_3d.cc   | 62 +++++++++----------
 tensorflow/core/kernels/conv_ops.cc           | 24 +++----
 tensorflow/core/kernels/conv_ops_3d.cc        | 26 ++++----
 tensorflow/core/kernels/conv_ops_gpu.h        | 26 ++++----
 tensorflow/core/kernels/crop_and_resize_op.cc |  8 +--
 tensorflow/core/kernels/cuda_device_array.h   |  2 +-
 tensorflow/core/kernels/cuda_solvers.cc       |  6 +-
 tensorflow/core/kernels/cuda_solvers.h        |  2 +-
 tensorflow/core/kernels/cudnn_pooling_gpu.cc  | 42 ++++++-------
 tensorflow/core/kernels/cudnn_pooling_gpu.h   |  4 +-
 tensorflow/core/kernels/cudnn_rnn_ops.cc      | 52 ++++++++--------
 .../core/kernels/depthwise_conv_op_gpu.cu.cc  |  3 +-
 .../kernels/dynamic_partition_op_gpu.cu.cc    |  4 +-
 tensorflow/core/kernels/fft_ops.cc            | 33 +++++-----
 .../core/kernels/fused_batch_norm_op.cc       | 22 +++----
 tensorflow/core/kernels/gpu_utils.h           |  8 +--
 tensorflow/core/kernels/lrn_op.cc             | 12 ++--
 tensorflow/core/kernels/matmul_op.cc          | 51 +++++++--------
 .../kernels/matrix_triangular_solve_op.cc     | 31 +++++-----
 tensorflow/core/kernels/maxpooling_op.cc      | 20 +++---
 tensorflow/core/kernels/pooling_ops_3d.cc     | 23 +++----
 tensorflow/core/kernels/pooling_ops_common.cc | 46 +++++++-------
 .../core/kernels/pooling_ops_common_gpu.h     |  4 +-
 .../core/kernels/segment_reduction_ops.cc     |  4 +-
 tensorflow/core/kernels/where_op.cc           |  5 +-
 .../platform/default/gpu/cupti_wrapper.cc     | 42 ++++++-------
 tensorflow/core/platform/types.h              |  4 +-
 69 files changed, 509 insertions(+), 600 deletions(-)

diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 7c83387881..e17a7c4bf6 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -88,9 +88,8 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
   // Converts the graph into an XLA computation, and compiles the
   // computation.
   // TODO(toddw): Should we let the user pick the XLA cpu vs. gpu client?
-  namespace gpu = perftools::gputools;
-  gpu::Platform* cpu_platform =
-      gpu::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
+  se::Platform* cpu_platform =
+      se::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
   xla::CompileOnlyClient* client =
       xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
           .ValueOrDie();
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index f48941fce3..03ae09ee8b 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -37,8 +37,6 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
-namespace gpu = perftools::gputools;
-
 namespace tensorflow {
 
 XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
@@ -51,9 +49,9 @@ XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
   num_constant_args_ = constant_types.size();
   OP_REQUIRES_OK(ctx, ctx->GetAttr("Nresources", &num_resource_args_));
   if (device_type_ == DeviceType(DEVICE_CPU)) {
-    platform_id_ = gpu::host::kHostPlatformId;
+    platform_id_ = se::host::kHostPlatformId;
   } else if (device_type_ == DeviceType(DEVICE_GPU)) {
-    platform_id_ = gpu::cuda::kCudaPlatformId;
+    platform_id_ = se::cuda::kCudaPlatformId;
   } else {
     platform_id_ = nullptr;
   }
@@ -69,7 +67,7 @@ Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx,
     return Status::OK();
   }
 
-  auto platform = gpu::MultiPlatformManager::PlatformWithId(platform_id_);
+  auto platform = se::MultiPlatformManager::PlatformWithId(platform_id_);
   if (!platform.ok()) {
     return StreamExecutorUtil::ConvertStatus(platform.status());
   }
@@ -100,7 +98,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   ResourceMgr* rm = ctx->resource_manager();
   OP_REQUIRES(ctx, rm, errors::Internal("No resource manager."));
 
-  gpu::Stream* stream =
+  se::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
 
   XlaCompilationCache* cache;
@@ -153,7 +151,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   options.device_type = &cache->device_type();
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
-  options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
+  options.allow_cpu_custom_calls = (platform_id_ == se::host::kHostPlatformId);
   options.device_allocator = xla_allocator;
   // TODO(b/77671268): We don't set variable_representation_shape_fn here. This
   // is restricted to Variables, but we need something like this to apply to
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.h b/tensorflow/compiler/jit/kernels/xla_launch_op.h
index c6cc0986af..8f8e646f0f 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.h
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.h
@@ -53,7 +53,7 @@ class XlaLocalLaunchOp : public OpKernel {
   // Number of resource variable arguments.
   int num_resource_args_;
 
-  perftools::gputools::Platform::Id platform_id_;
+  se::Platform::Id platform_id_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalLaunchOp);
 };
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index 6c2782e28e..60458f6f33 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -58,7 +58,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
 
   launch_context.PopulateInputs(ctx, result, variables);
 
-  perftools::gputools::Stream* stream =
+  se::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
   TF_RET_CHECK(stream);
 
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 2c2ac839b3..7beb18c04d 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -51,8 +51,6 @@ limitations under the License.
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
-namespace se = ::perftools::gputools;
-
 namespace tensorflow {
 
 // Caches a XlaDeviceAllocator per <backend, device ordinal> pair. A
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 2f5c53aea8..3ae87308cc 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -49,20 +49,20 @@ class XlaDevice : public LocalDevice {
   // retrieved e.g., when lazily creating the XlaCompilationCache device.
   class Metadata {
    public:
-    Metadata(int device_ordinal, perftools::gputools::Platform* platform,
+    Metadata(int device_ordinal, se::Platform* platform,
              const DeviceType& device_type);
 
     // The index of the device on this host.
     int device_ordinal() const;
 
-    perftools::gputools::Platform* platform() const;
+    se::Platform* platform() const;
     xla::LocalClient* client() const;
     const DeviceType& jit_device_type() const;
 
    private:
     const int device_ordinal_;
     const DeviceType device_type_;
-    perftools::gputools::Platform* platform_;  // Not owned.
+    se::Platform* platform_;  // Not owned.
 
     TF_DISALLOW_COPY_AND_ASSIGN(Metadata);
   };
@@ -85,8 +85,7 @@ class XlaDevice : public LocalDevice {
 
   XlaDevice(const SessionOptions& options, const DeviceAttributes& attrs,
             int device_ordinal, const DeviceType& jit_device_name,
-            ::perftools::gputools::Platform* platform,
-            bool transfer_as_literal);
+            se::Platform* platform, bool transfer_as_literal);
   ~XlaDevice() override;
 
   Allocator* GetAllocator(AllocatorAttributes attr) override;
@@ -103,7 +102,7 @@ class XlaDevice : public LocalDevice {
                              Tensor* tensor) override;
 
   xla::LocalClient* client() const;
-  xla::StatusOr<::perftools::gputools::Stream*> GetStream();
+  xla::StatusOr<se::Stream*> GetStream();
 
   // If not already set, create and set GpuDeviceInfo.
   // Not thread-safe
@@ -118,7 +117,7 @@ class XlaDevice : public LocalDevice {
   DeviceType jit_device_name_;
   // Memory allocator associated with this device.
   Allocator* xla_allocator_;                   // Not owned.
-  ::perftools::gputools::Platform* platform_;  // Not owned.
+  se::Platform* platform_;                     // Not owned.
   // Stream associated with this device. Operations enqueued on this
   // stream are executed on the device. Operations include data
   // copying back and forth between CPU and the device, and
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 43eb164012..bf8c1886a0 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -23,8 +23,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/platform/mem.h"
 
-namespace se = ::perftools::gputools;
-
 namespace tensorflow {
 
 // The allocator used for Tensors assigned to the XLA device.
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index ad914a1c23..d7f5f1d208 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -45,8 +45,7 @@ class XlaDeviceAllocator : public Allocator {
 // Helper class for managing data transfers between host and XLA devices.
 class XlaTransferManager {
  public:
-  explicit XlaTransferManager(perftools::gputools::Stream* stream,
-                              xla::LocalClient* client,
+  explicit XlaTransferManager(se::Stream* stream, xla::LocalClient* client,
                               bool transfer_as_literal);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
@@ -54,7 +53,7 @@ class XlaTransferManager {
   void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                              StringPiece tensor_name, Device* device,
                              Tensor* cpu_tensor, StatusCallback done);
-  perftools::gputools::Stream* stream() const { return stream_; }
+  se::Stream* stream() const { return stream_; }
 
  private:
   Status TransferLiteralToDevice(const Tensor& host_tensor,
@@ -64,7 +63,7 @@ class XlaTransferManager {
 
   // Stream obtained from a Device, used to transfer tensors between
   // CPU and device.
-  perftools::gputools::Stream* stream_;
+  se::Stream* stream_;
   // For the underlying memory allocator and XLA's TransferManager.
   xla::LocalClient* client_;
   // Transfer manager, for marshalling data to and from the device.
@@ -78,8 +77,8 @@ class XlaTransferManager {
 // wraps the methods in XlaTransferManager.
 class XlaDeviceContext : public DeviceContext {
  public:
-  explicit XlaDeviceContext(perftools::gputools::Stream* stream,
-                            xla::LocalClient* client, bool transfer_as_literal);
+  explicit XlaDeviceContext(se::Stream* stream, xla::LocalClient* client,
+                            bool transfer_as_literal);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                              Tensor* device_tensor,
@@ -87,9 +86,7 @@ class XlaDeviceContext : public DeviceContext {
   void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                              StringPiece tensor_name, Device* device,
                              Tensor* cpu_tensor, StatusCallback done) override;
-  perftools::gputools::Stream* stream() const override {
-    return manager_.stream();
-  }
+  se::Stream* stream() const override { return manager_.stream(); }
 
  private:
   XlaTransferManager manager_;
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 3520501c1a..2a7f04271d 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -32,13 +32,12 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
+namespace tensorflow {
 namespace {
-namespace gpu = perftools::gputools;
 using xla::ScopedShapedBuffer;
 using xla::ShapedBuffer;
 }  // anonymous namespace
 
-namespace tensorflow {
 std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
                                                         int num_variables) {
   std::map<int, OptionalTensor> snapshot;
@@ -57,24 +56,23 @@ std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
   return snapshot;
 }
 
-XlaAllocator::XlaAllocator(const gpu::Platform* platform, Allocator* wrapped)
+XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped)
     : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
 
 XlaAllocator::~XlaAllocator() {}
 
-xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
+xla::StatusOr<se::DeviceMemoryBase> XlaAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
   void* data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size);
   if (data == nullptr) {
     return errors::ResourceExhausted("Out of memory while trying to allocate ",
                                      size, " bytes.");
   } else {
-    return gpu::DeviceMemoryBase(data, size);
+    return se::DeviceMemoryBase(data, size);
   }
 }
 
-Status XlaAllocator::Deallocate(int device_ordinal,
-                                gpu::DeviceMemoryBase* mem) {
+Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) {
   wrapped_->DeallocateRaw(mem->opaque());
   return Status::OK();
 }
@@ -102,7 +100,7 @@ ScopedShapedBuffer ExtractSubShapedBuffer(
                                  /*target_base_index=*/{});
   for (auto& index_to_buffer : shape_tree) {
     if (!index_to_buffer.first.empty() && index_to_buffer.first[0] == index) {
-      index_to_buffer.second = gpu::DeviceMemoryBase(nullptr, 0);
+      index_to_buffer.second = se::DeviceMemoryBase(nullptr, 0);
     }
   }
   return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator);
@@ -149,7 +147,7 @@ void XlaComputationLaunchContext::PopulateInputs(
           << xla::ShapeUtil::HumanStringWithLayout(on_device_shape)
           << " not the same as on-host shape "
           << xla::ShapeUtil::HumanStringWithLayout(shape);
-      gpu::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t);
+      se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t);
       arg_buffers_[i] = xla::MakeUnique<ShapedBuffer>(
           /*on_host_shape=*/shape, /*on_device_shape=*/shape,
           client_->platform(), client_->default_device_ordinal());
@@ -162,7 +160,7 @@ void XlaComputationLaunchContext::PopulateInputs(
 void XlaComputationLaunchContext::PopulateOutputs(
     OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
     ScopedShapedBuffer output) {
-  gpu::Stream* stream =
+  se::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
 
   // Computation output should always be a tuple.
@@ -227,7 +225,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
       const TensorShape& shape = kernel->outputs[i].shape;
       VLOG(2) << "Retval " << i << " shape " << shape.DebugString();
 
-      gpu::DeviceMemoryBase buffer = output.buffer({output_num});
+      se::DeviceMemoryBase buffer = output.buffer({output_num});
       if (allocate_xla_tensors_) {
         Tensor* output_tensor;
         OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor));
@@ -238,7 +236,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
       } else {
         Tensor output_tensor = XlaTensorBuffer::MakeTensor(
             ctx->expected_output_dtype(i), shape, buffer, allocator);
-        output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+        output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num});
         ctx->set_output(i, output_tensor);
       }
       ++output_num;
@@ -258,7 +256,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
                 write.input_index >= 0 && write.input_index < ctx->num_inputs(),
                 errors::Internal("Invalid input index for variable write."));
 
-    gpu::DeviceMemoryBase buffer = output.buffer({output_num});
+    se::DeviceMemoryBase buffer = output.buffer({output_num});
 
     Var* variable = nullptr;
     // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor,
@@ -288,7 +286,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
     } else {
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           write.type, write.shape, buffer, allocator);
-      output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+      output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num});
       *variable->tensor() = output_tensor;
     }
     ++output_num;
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 26dcaa8a51..8a6ff3b0c7 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -46,13 +46,11 @@ std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
 // see comment on `AllowsAsynchronousDeallocation()`.
 class XlaAllocator : public xla::DeviceMemoryAllocator {
  public:
-  XlaAllocator(const perftools::gputools::Platform* platform,
-               Allocator* wrapped);
+  XlaAllocator(const se::Platform* platform, Allocator* wrapped);
   ~XlaAllocator() override;
-  xla::StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure) override;
-  Status Deallocate(int device_ordinal,
-                    perftools::gputools::DeviceMemoryBase* mem) override;
+  xla::StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
+                                               bool retry_on_failure) override;
+  Status Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) override;
 
   // The Tensorflow BFC allocator used on GPU allows host-side deallocation
   // before GPU execution takes place. Tensorflow uses the ordering of the main
@@ -126,8 +124,7 @@ class XlaTensorBuffer : public TensorBuffer {
   }
 
   static Tensor MakeTensor(DataType dtype, const TensorShape& shape,
-                           perftools::gputools::DeviceMemoryBase buffer,
-                           Allocator* allocator) {
+                           se::DeviceMemoryBase buffer, Allocator* allocator) {
     size_t expected_size = shape.num_elements() * DataTypeSize(dtype);
     auto* tensor_buffer = new XlaTensorBuffer(buffer.opaque(), expected_size,
                                               buffer.size(), allocator);
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index 84b2835c40..ce6456880b 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -31,16 +31,15 @@ namespace tensorflow {
   return FromTensor(const_cast<Tensor*>(tensor));
 }
 
-/*static*/ perftools::gputools::DeviceMemoryBase
-XlaTensor::DeviceMemoryFromTensor(const Tensor& tensor) {
+/*static*/ se::DeviceMemoryBase XlaTensor::DeviceMemoryFromTensor(
+    const Tensor& tensor) {
   const XlaTensor* xla_tensor = FromTensor(&tensor);
   if (xla_tensor) {
     CHECK(xla_tensor->has_shaped_buffer());
     return xla_tensor->shaped_buffer().root_buffer();
   } else {
-    return perftools::gputools::DeviceMemoryBase(
-        const_cast<char*>(tensor.tensor_data().data()),
-        tensor.tensor_data().size());
+    return se::DeviceMemoryBase(const_cast<char*>(tensor.tensor_data().data()),
+                                tensor.tensor_data().size());
   }
 }
 
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
index 2334fd272b..922a918973 100644
--- a/tensorflow/compiler/jit/xla_tensor.h
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -43,8 +43,7 @@ class XlaTensor {
   // which case the returned value is shaped_buffer()->root_buffer(), or a
   // normal Tensor in which case the returned value is
   // {tensor.tensor_data().data(), tensor.tensor_data().size}.
-  static perftools::gputools::DeviceMemoryBase DeviceMemoryFromTensor(
-      const Tensor& tensor);
+  static se::DeviceMemoryBase DeviceMemoryFromTensor(const Tensor& tensor);
 
   // Assign the internal ShapedBuffer to new memory for the given dtype and
   // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 1e8f011b5d..2458f7554a 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -247,7 +247,7 @@ class FusedConv2DBiasActivationOp : public OpKernel {
 };
 
 #if GOOGLE_CUDA
-namespace dnn = ::perftools::gputools::dnn;
+namespace dnn = se::dnn;
 
 // A dummy type to group forward convolution autotune results together.
 struct ConvBiasActivationAutoTuneGroup {
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
index b71ff9cd50..1be97ae3d6 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -59,7 +59,7 @@ void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
       delta_h, scale_s, scale_v, tranformation_matrix.flat<float>().data(),
       tranformation_matrix.flat<float>().size());
   // Call cuBlas C = A * B directly.
-  auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+  auto no_transpose = se::blas::Transpose::kNoTranspose;
   auto a_ptr =
       AsDeviceMemory(input->flat<float>().data(), input->flat<float>().size());
   auto b_ptr = AsDeviceMemory(tranformation_matrix.flat<float>().data(),
diff --git a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
index 8dca90a1e3..ed22ee667f 100644
--- a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
+++ b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
@@ -73,7 +73,7 @@ limitations under the License.
  */
 
 template <class T>
-using StatusOr = perftools::gputools::port::StatusOr<T>;
+using StatusOr = se::port::StatusOr<T>;
 
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/mpi_ops.cc
index a051ab0004..475297ca92 100644
--- a/tensorflow/contrib/mpi_collectives/mpi_ops.cc
+++ b/tensorflow/contrib/mpi_collectives/mpi_ops.cc
@@ -74,7 +74,7 @@ limitations under the License.
  */
 
 template <class T>
-using StatusOr = perftools::gputools::port::StatusOr<T>;
+using StatusOr = se::port::StatusOr<T>;
 
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index b9b482a698..b1cb89391c 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -24,7 +24,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using se::cuda::ScopedActivateExecutorContext;
 
 // Contains data for a single stream used for nccl communication; this includes
 // a background thread that calls NcclManager::LoopKernelLaunches.
@@ -37,11 +37,11 @@ struct NcclManager::NcclStream {
     cv.notify_all();
   }
 
-  perftools::gputools::StreamExecutor* executor = nullptr;
+  se::StreamExecutor* executor = nullptr;
 
   // The stream on which to run the nccl collective.
   // This is a different stream than the tensorflow compute stream.
-  std::unique_ptr<perftools::gputools::Stream> stream;
+  std::unique_ptr<se::Stream> stream;
 
   // See NcclManager::LoopKernelLaunches for information on these.
   std::unique_ptr<Thread> thread;
@@ -95,9 +95,8 @@ ncclDataType_t ToNcclType(DataType t) {
 // A participant in a Collective.  See <Collective> below.
 struct NcclManager::Participant {
   Participant(const Tensor* in_t, Tensor* out_t, EventMgr* event_mgr,
-              perftools::gputools::Stream* tensor_stream,
-              perftools::gputools::StreamExecutor* executor, int gpu_device_id,
-              NcclManager::DoneCallback done_callback)
+              se::Stream* tensor_stream, se::StreamExecutor* executor,
+              int gpu_device_id, NcclManager::DoneCallback done_callback)
       : in_t(in_t),
         out_t(out_t),
         event_mgr(event_mgr),
@@ -121,11 +120,11 @@ struct NcclManager::Participant {
   EventMgr* const event_mgr;
 
   // Owned by the caller, who must keep it live until <done_callback> is called.
-  perftools::gputools::Stream* const tensor_stream;
+  se::Stream* const tensor_stream;
 
   // Matches the executor in CommunicatorMember::stream. Expected to be live for
   // process lifetime.
-  perftools::gputools::StreamExecutor* const executor = nullptr;
+  se::StreamExecutor* const executor = nullptr;
 
   const int gpu_device_id;
 
@@ -245,7 +244,7 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
     if (nccl_stream == nullptr) {
       nccl_stream = new NcclStream();
       nccl_stream->executor = executor;
-      nccl_stream->stream.reset(new perftools::gputools::Stream(executor));
+      nccl_stream->stream.reset(new se::Stream(executor));
       nccl_stream->stream->Init();
 
       streams.emplace_back(nccl_stream);
@@ -300,10 +299,10 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
 
 void NcclManager::AddToAllReduce(int num_devices, const string& key,
                                  ncclRedOp_t reduction_op,
-                                 perftools::gputools::StreamExecutor* executor,
+                                 se::StreamExecutor* executor,
                                  int gpu_device_id, EventMgr* event_mgr,
-                                 perftools::gputools::Stream* tensor_stream,
-                                 const Tensor* in_t, Tensor* out_t,
+                                 se::Stream* tensor_stream, const Tensor* in_t,
+                                 Tensor* out_t,
                                  const DoneCallback& done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(in_t, out_t, event_mgr, tensor_stream, executor,
@@ -312,11 +311,12 @@ void NcclManager::AddToAllReduce(int num_devices, const string& key,
                  kAllReduce, reduction_op);
 }
 
-void NcclManager::AddBroadcastSend(
-    int num_devices, const string& key,
-    perftools::gputools::StreamExecutor* executor, int gpu_device_id,
-    EventMgr* event_mgr, perftools::gputools::Stream* tensor_stream,
-    const Tensor* in_t, DoneCallback done_callback) {
+void NcclManager::AddBroadcastSend(int num_devices, const string& key,
+                                   se::StreamExecutor* executor,
+                                   int gpu_device_id, EventMgr* event_mgr,
+                                   se::Stream* tensor_stream,
+                                   const Tensor* in_t,
+                                   DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream,
                       executor, gpu_device_id, std::move(done_callback)));
@@ -325,11 +325,11 @@ void NcclManager::AddBroadcastSend(
                  kBroadcast, ncclSum /* unused */);
 }
 
-void NcclManager::AddBroadcastRecv(
-    int num_devices, const string& key,
-    perftools::gputools::StreamExecutor* executor, int gpu_device_id,
-    EventMgr* event_mgr, perftools::gputools::Stream* tensor_stream,
-    Tensor* out_t, DoneCallback done_callback) {
+void NcclManager::AddBroadcastRecv(int num_devices, const string& key,
+                                   se::StreamExecutor* executor,
+                                   int gpu_device_id, EventMgr* event_mgr,
+                                   se::Stream* tensor_stream, Tensor* out_t,
+                                   DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(nullptr /* in_t */, out_t, event_mgr, tensor_stream,
                       executor, gpu_device_id, std::move(done_callback)));
@@ -339,9 +339,8 @@ void NcclManager::AddBroadcastRecv(
 
 void NcclManager::AddReduceSend(int num_devices, const string& key,
                                 ncclRedOp_t reduction_op,
-                                perftools::gputools::StreamExecutor* executor,
-                                int gpu_device_id, EventMgr* event_mgr,
-                                perftools::gputools::Stream* tensor_stream,
+                                se::StreamExecutor* executor, int gpu_device_id,
+                                EventMgr* event_mgr, se::Stream* tensor_stream,
                                 const Tensor* in_t,
                                 DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
@@ -353,9 +352,8 @@ void NcclManager::AddReduceSend(int num_devices, const string& key,
 
 void NcclManager::AddReduceRecv(int num_devices, const string& key,
                                 ncclRedOp_t reduction_op,
-                                perftools::gputools::StreamExecutor* executor,
-                                int gpu_device_id, EventMgr* event_mgr,
-                                perftools::gputools::Stream* tensor_stream,
+                                se::StreamExecutor* executor, int gpu_device_id,
+                                EventMgr* event_mgr, se::Stream* tensor_stream,
                                 const Tensor* in_t, Tensor* out_t,
                                 DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
@@ -444,7 +442,7 @@ void NcclManager::RunCollective(const string& key, Collective* collective) {
 }
 
 void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
-  perftools::gputools::Stream* comm_stream = nccl_stream->stream.get();
+  se::Stream* comm_stream = nccl_stream->stream.get();
   ScopedActivateExecutorContext scoped_context(nccl_stream->executor);
   const cudaStream_t* cu_stream = reinterpret_cast<const cudaStream_t*>(
       comm_stream->implementation()->CudaStreamMemberHack());
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index 6ff8cea84e..57a96c5d33 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -55,41 +55,34 @@ class NcclManager {
   // is also the stream that will use the produced data; <done_callback> is
   // not called until the next kernel launched on <stream> would see the data.
   void AddToAllReduce(int num_devices, const string& key,
-                      ncclRedOp_t reduction_op,
-                      perftools::gputools::StreamExecutor* executor,
+                      ncclRedOp_t reduction_op, se::StreamExecutor* executor,
                       int gpu_device_id, EventMgr* event_mgr,
-                      perftools::gputools::Stream* tensor_stream,
-                      const Tensor* in_t, Tensor* out_t,
-                      const DoneCallback& done_callback);
+                      se::Stream* tensor_stream, const Tensor* in_t,
+                      Tensor* out_t, const DoneCallback& done_callback);
 
   // AddBroadcastSend and AddBroadcastRecv combine to sent data from one sender
   // to all receivers.
   void AddBroadcastSend(int num_devices, const string& key,
-                        perftools::gputools::StreamExecutor* executor,
-                        int gpu_device_id, EventMgr* event_mgr,
-                        perftools::gputools::Stream* tensor_stream,
+                        se::StreamExecutor* executor, int gpu_device_id,
+                        EventMgr* event_mgr, se::Stream* tensor_stream,
                         const Tensor* in_t, DoneCallback done_callback);
   void AddBroadcastRecv(int num_devices, const string& key,
-                        perftools::gputools::StreamExecutor* executor,
-                        int gpu_device_id, EventMgr* event_mgr,
-                        perftools::gputools::Stream* tensor_stream,
+                        se::StreamExecutor* executor, int gpu_device_id,
+                        EventMgr* event_mgr, se::Stream* tensor_stream,
                         Tensor* out_t, DoneCallback done_callback);
 
   // AddReduceSend and AddReduceRecv combine to sent data from all senders
   // to one receiver.
   void AddReduceSend(int num_devices, const string& key,
-                     ncclRedOp_t reduction_op,
-                     perftools::gputools::StreamExecutor* executor,
+                     ncclRedOp_t reduction_op, se::StreamExecutor* executor,
                      int gpu_device_id, EventMgr* event_mgr,
-                     perftools::gputools::Stream* tensor_stream,
-                     const Tensor* in_t, DoneCallback done_callback);
+                     se::Stream* tensor_stream, const Tensor* in_t,
+                     DoneCallback done_callback);
   void AddReduceRecv(int num_devices, const string& key,
-                     ncclRedOp_t reduction_op,
-                     perftools::gputools::StreamExecutor* executor,
+                     ncclRedOp_t reduction_op, se::StreamExecutor* executor,
                      int gpu_device_id, EventMgr* event_mgr,
-                     perftools::gputools::Stream* tensor_stream,
-                     const Tensor* in_t, Tensor* out_t,
-                     DoneCallback done_callback);
+                     se::Stream* tensor_stream, const Tensor* in_t,
+                     Tensor* out_t, DoneCallback done_callback);
 
  private:
   enum CollectiveType {
@@ -123,8 +116,7 @@ class NcclManager {
   // Maps a device to the communication streams that make up its collective.
   // This is used to share the stream across different communicators that
   // include the same device.
-  std::map<perftools::gputools::StreamExecutor*,
-           std::vector<std::unique_ptr<NcclStream>>>
+  std::map<se::StreamExecutor*, std::vector<std::unique_ptr<NcclStream>>>
       device_to_comm_streams_ GUARDED_BY(mu_);
 
   std::vector<std::unique_ptr<Communicator>> communicators_;
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
index 06ca65e33a..4d8d922cb4 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
@@ -175,11 +175,9 @@ class NcclManagerTest : public ::testing::Test {
                                     nullptr /* step_resource_manager */);
   }
 
-  static perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
-      const Scalar* cuda_memory) {
-    perftools::gputools::DeviceMemoryBase wrapped(
-        const_cast<Scalar*>(cuda_memory));
-    perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
+  static se::DeviceMemory<Scalar> AsDeviceMemory(const Scalar* cuda_memory) {
+    se::DeviceMemoryBase wrapped(const_cast<Scalar*>(cuda_memory));
+    se::DeviceMemory<Scalar> typed(wrapped);
     return typed;
   }
 
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
index 03006dab32..45d22b739b 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
@@ -26,9 +26,9 @@ namespace tensorflow {
 #if GOOGLE_CUDA
 namespace {
 template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 }  // namespace
@@ -41,9 +41,8 @@ void TensorCuBlasGemm<T>::operator()(OpKernelContext* ctx, bool transa,
                                      T alpha, const T* a, int lda, const T* b,
                                      int ldb, T beta, T* c, int ldc) {
 #if GOOGLE_CUDA
-  perftools::gputools::blas::Transpose trans[] = {
-      perftools::gputools::blas::Transpose::kNoTranspose,
-      perftools::gputools::blas::Transpose::kTranspose};
+  se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose,
+                                 se::blas::Transpose::kTranspose};
 
   auto a_ptr = AsDeviceMemory(a);
   auto b_ptr = AsDeviceMemory(b);
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index b32371b642..53ba7badca 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -25,7 +25,6 @@ limitations under the License.
 
 namespace tensorflow {
 static ::tensorflow::tensorrt::Logger logger;
-namespace gpu = ::perftools::gputools;
 using IRuntime = nvinfer1::IRuntime;
 using Dims = nvinfer1::Dims;
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index c2c0b020c7..ad142e9982 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
 // A GPU memory allocator that implements a 'best-fit with coalescing'
@@ -52,7 +50,7 @@ class GPUBFCAllocator : public BFCAllocator {
 class GPUMemAllocator : public SubAllocator {
  public:
   // Note: stream_exec cannot be null.
-  explicit GPUMemAllocator(perftools::gputools::StreamExecutor* stream_exec)
+  explicit GPUMemAllocator(se::StreamExecutor* stream_exec)
       : stream_exec_(stream_exec) {
     CHECK(stream_exec_ != nullptr);
   }
@@ -68,13 +66,13 @@ class GPUMemAllocator : public SubAllocator {
 
   void Free(void* ptr, size_t num_bytes) override {
     if (ptr != nullptr) {
-      gpu::DeviceMemoryBase gpu_ptr(ptr);
+      se::DeviceMemoryBase gpu_ptr(ptr);
       stream_exec_->Deallocate(&gpu_ptr);
     }
   }
 
  private:
-  perftools::gputools::StreamExecutor* stream_exec_;  // not owned, non-null
+  se::StreamExecutor* stream_exec_;  // not owned, non-null
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 208697361d..5043fac797 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -44,7 +44,7 @@ class GPUcudaMallocAllocator : public VisitableAllocator {
  private:
   VisitableAllocator* base_allocator_ = nullptr;  // owned
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // Not owned.
+  se::StreamExecutor* stream_exec_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUcudaMallocAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index b0ca7e3109..4ff5fab866 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -40,8 +40,7 @@ int64* NewMask(int64 word) {
 int64* before_mask = NewMask(0xabababababababab);
 int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);
 
-bool CheckMask(perftools::gputools::StreamExecutor* exec, void* ptr,
-               int64* mask) {
+bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
   gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
   int64 tmp[MASK_WORDS];
 
@@ -62,8 +61,7 @@ bool CheckMask(perftools::gputools::StreamExecutor* exec, void* ptr,
   return ok;
 }
 
-void InitMask(perftools::gputools::StreamExecutor* exec, void* ptr,
-              int64* mask) {
+void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
   gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
   if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) {
     LOG(FATAL) << "Could not copy debug mask";
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index adce3a8436..c49ec2a566 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -55,7 +55,7 @@ class GPUDebugAllocator : public VisitableAllocator {
  private:
   VisitableAllocator* base_allocator_ = nullptr;  // owned
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // Not owned.
+  se::StreamExecutor* stream_exec_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUDebugAllocator);
 };
@@ -81,7 +81,7 @@ class GPUNanResetAllocator : public VisitableAllocator {
  private:
   VisitableAllocator* base_allocator_ = nullptr;  // owned
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // Not owned.
+  se::StreamExecutor* stream_exec_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUNanResetAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 0b9e8f9cc2..f7248ca79d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -297,9 +297,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
     }
     scratch_.push_back(static_cast<char*>(scratch_buffer));
 
-    perftools::gputools::DeviceMemory<char> mem(
-        perftools::gputools::DeviceMemoryBase(scratch_buffer,
-                                              scratch_buffer_size));
+    se::DeviceMemory<char> mem(
+        se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
 
     bool ok = executor_->SynchronousMemZero(
         &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index af6a59a85d..4898448476 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -18,11 +18,9 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
-EventMgr::EventMgr(gpu::StreamExecutor* se, const GPUOptions& gpu_options)
+EventMgr::EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options)
     : exec_(se),
       deferred_bytes_threshold_(gpu_options.deferred_deletion_bytes()
                                     ? gpu_options.deferred_deletion_bytes()
@@ -94,7 +92,7 @@ void EventMgr::StopPollingLoop() {
   }
 }
 
-void EventMgr::ThenDeleteTensors(perftools::gputools::Stream* stream,
+void EventMgr::ThenDeleteTensors(se::Stream* stream,
                                  const TensorReferenceVector& tensors) {
   mutex_lock l(mu_);
   // TODO(jeff): We currently keep one accumulated_tensors_ object.
@@ -152,16 +150,16 @@ void EventMgr::PollLoop() {
   polling_stopped_->Notify();
 }
 
-void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) {
+void EventMgr::QueueInUse(se::Stream* stream, InUse iu) {
   VLOG(2) << "QueueInUse  free_events_ " << free_events_.size()
           << " used_events_ " << used_events_.size();
   // Events are created on demand, and repeatedly reused.  There is no
   // limit placed here on the number of allocated Events.
   if (free_events_.empty()) {
-    free_events_.push_back(new gpu::Event(exec_));
+    free_events_.push_back(new se::Event(exec_));
     free_events_.back()->Init();
   }
-  gpu::Event* e = free_events_.back();
+  se::Event* e = free_events_.back();
   free_events_.pop_back();
   stream->ThenRecordEvent(e);
   iu.event = e;
@@ -199,18 +197,18 @@ void EventMgr::PollEvents(bool is_dedicated_poller,
   // the first non-complete record that is still pending.
   for (auto& iu : used_events_) {
     if (iu.event == nullptr) continue;
-    gpu::Event::Status s = iu.event->PollForStatus();
+    se::Event::Status s = iu.event->PollForStatus();
     switch (s) {
-      case gpu::Event::Status::kUnknown:
-      case gpu::Event::Status::kError:
+      case se::Event::Status::kUnknown:
+      case se::Event::Status::kError:
         // We don't expect to see these.  Someday maybe propagate
         // a Status error, but for now fail hard.
         LOG(FATAL) << "Unexpected Event status: " << static_cast<int>(s);
         break;
-      case gpu::Event::Status::kPending:
+      case se::Event::Status::kPending:
         if (!is_dedicated_poller) return;  // quit processing queue
         break;
-      case gpu::Event::Status::kComplete:
+      case se::Event::Status::kComplete:
         // Make a copy of the InUse record so we can free it after releasing
         // the lock
         to_free->push_back(iu);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
index fd5f50ca4e..b26f88a201 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -44,14 +44,13 @@ class GPUOptions;
 // Events are recorded.
 class EventMgr {
  public:
-  EventMgr(perftools::gputools::StreamExecutor* se,
-           const GPUOptions& gpu_options);
+  EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options);
 
   ~EventMgr();
 
   // Releases the references on the elements of "tensors" as soon as
   // all events currently enqueued on "stream" have completed.
-  void ThenDeleteTensors(perftools::gputools::Stream* stream,
+  void ThenDeleteTensors(se::Stream* stream,
                          const TensorReferenceVector& tensors);
 
   struct BufRec {
@@ -65,8 +64,7 @@ class EventMgr {
 
   // Takes ownership of *bufrec.buf and calls bufrec.alloc->DeallocateRaw()
   // on it as soon as all events currently enqueued on *stream have completed.
-  inline void ThenDeleteBuffer(perftools::gputools::Stream* stream,
-                               BufRec bufrec) {
+  inline void ThenDeleteBuffer(se::Stream* stream, BufRec bufrec) {
     ToFreeVector to_free;
     {
       mutex_lock l(mu_);
@@ -76,8 +74,7 @@ class EventMgr {
     FreeMemory(to_free);
   }
 
-  inline void ThenExecute(perftools::gputools::Stream* stream,
-                          std::function<void()> func) {
+  inline void ThenExecute(se::Stream* stream, std::function<void()> func) {
     ToFreeVector to_free;
     {
       mutex_lock l(mu_);
@@ -89,7 +86,7 @@ class EventMgr {
 
  private:
   friend class TEST_EventMgrHelper;
-  perftools::gputools::StreamExecutor* const exec_;
+  se::StreamExecutor* const exec_;
   const int64 deferred_bytes_threshold_;
   const int32 polling_active_delay_usecs_;
   mutex mu_;
@@ -98,7 +95,7 @@ class EventMgr {
   void FlushAccumulatedTensors() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   struct InUse {
-    perftools::gputools::Event* event;
+    se::Event* event;
     TensorReferenceVector* mem;
     BufRec bufrec;
     std::function<void()> func;
@@ -130,22 +127,21 @@ class EventMgr {
   // Stream-enqueue an unused Event and save with it a collection of
   // Tensors and/or a BufRec to be deleted only after the Event
   // records.
-  void QueueInUse(perftools::gputools::Stream* stream, InUse in_use)
+  void QueueInUse(se::Stream* stream, InUse in_use)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  void QueueTensors(perftools::gputools::Stream* stream,
-                    TensorReferenceVector* tensors)
+  void QueueTensors(se::Stream* stream, TensorReferenceVector* tensors)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr});
   }
 
-  void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec)
+  void QueueBuffer(se::Stream* stream, BufRec bufrec)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr});
   }
 
-  void QueueFunc(perftools::gputools::Stream* stream,
-                 std::function<void()> func) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+  void QueueFunc(se::Stream* stream, std::function<void()> func)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     QueueInUse(stream, {nullptr, nullptr, BufRec(), std::move(func)});
   }
 
@@ -166,10 +162,10 @@ class EventMgr {
   void StopPollingLoop();
 
   // A stack of unused events
-  std::vector<perftools::gputools::Event*> free_events_ GUARDED_BY(mu_);
+  std::vector<se::Event*> free_events_ GUARDED_BY(mu_);
 
   // Buffered list of tensors waiting to have an event queued for deletion
-  perftools::gputools::Stream* accumulated_stream_ GUARDED_BY(mu_);
+  se::Stream* accumulated_stream_ GUARDED_BY(mu_);
   TensorReferenceVector* accumulated_tensors_ GUARDED_BY(mu_);
   // Sum of the TotalBytes() of the tensors in "accumulated_tensors_"
   int64 accumulated_tensor_bytes_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index 3ad0b0eb85..1d4ad957b9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -23,8 +23,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
 class TEST_EventMgrHelper {
@@ -47,8 +45,7 @@ class TEST_EventMgrHelper {
     return em_->free_events_.size();
   }
 
-  void QueueTensors(perftools::gputools::Stream* stream,
-                    TensorReferenceVector* tensors) {
+  void QueueTensors(se::Stream* stream, TensorReferenceVector* tensors) {
     mutex_lock l(em_->mu_);
     em_->QueueTensors(stream, tensors);
   }
@@ -121,7 +118,7 @@ TEST(EventMgr, DelayedPolling) {
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, th.queue_size());
   TensorReferenceVector* v = nullptr;
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -153,7 +150,7 @@ TEST(EventMgr, FlushLargeTensorImmediately) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -170,7 +167,7 @@ TEST(EventMgr, ManySmallTensorsFlushedImmediately) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -189,8 +186,8 @@ TEST(EventMgr, StreamSwitchingFlushesImmediately) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream1(new gpu::Stream(stream_exec));
-  std::unique_ptr<gpu::Stream> stream2(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream1(new se::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream2(new se::Stream(stream_exec));
   stream1->Init();
   stream2->Init();
   TensorReferenceVector v1;
@@ -211,7 +208,7 @@ TEST(EventMgr, ManySmallTensorsSeparateCallsFlushed) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -234,7 +231,7 @@ TEST(EventMgr, NonEmptyShutdown) {
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, th.queue_size());
   EXPECT_EQ(0, th.free_size());
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.cc b/tensorflow/core/common_runtime/gpu/gpu_init.cc
index aa23e3cc61..ff96891a2a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.cc
@@ -26,12 +26,10 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
 Status ValidateGPUMachineManager() {
-  auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
+  auto result = se::MultiPlatformManager::PlatformWithName("CUDA");
   if (!result.ok()) {
     return StreamExecutorUtil::ConvertStatus(result.status());
   }
@@ -39,8 +37,8 @@ Status ValidateGPUMachineManager() {
   return Status::OK();
 }
 
-gpu::Platform* GPUMachineManager() {
-  auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
+se::Platform* GPUMachineManager() {
+  auto result = se::MultiPlatformManager::PlatformWithName("CUDA");
   if (!result.ok()) {
     LOG(FATAL) << "Could not find Platform with name CUDA";
     return nullptr;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 5214ceaae5..7ba853fa51 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -55,19 +55,15 @@ limitations under the License.
 const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128;
 extern bool FLAGS_brain_gpu_record_mem_types;
 
-using perftools::gputools::DeviceMemoryBase;
-using perftools::gputools::Stream;
-
 namespace tensorflow {
 
-// TODO(b/77980417): Remove this and use the regular tensorflow::se alias once
-// that's available.
-namespace gpu = ::stream_executor;
+using se::DeviceMemoryBase;
+using se::Stream;
 
 Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src,
                    const Tensor* dst,
                    const DeviceBase::GpuDeviceInfo** dev_info,
-                   gpu::Stream** stream) {
+                   se::Stream** stream) {
   if (device == nullptr) {
     return errors::Internal("Unexpected null device.");
   }
@@ -122,7 +118,7 @@ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
                               StatusCallback done) {
   VLOG(1) << "SetProtoFromGPU device_context " << device_context;
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(dev, device_context, tensor, nullptr, &dev_info,
                          &send_stream);
   if (!s.ok()) {
@@ -197,7 +193,7 @@ void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
                                  const Tensor* input, Tensor* output,
                                  StatusCallback done) {
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(src, send_dev_context, *input, output, &dev_info,
                          &send_stream);
   if (!s.ok()) {
@@ -264,7 +260,7 @@ void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
                                  StatusCallback done) {
   VLOG(1) << "CopyGPUTensorToCPU";
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor,
                          &dev_info, &send_stream);
   if (!s.ok()) {
@@ -309,7 +305,7 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
                                  StatusCallback done) {
   VLOG(1) << "CopyCPUTensorToGPU";
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* recv_stream = nullptr;
+  se::Stream* recv_stream = nullptr;
   Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor,
                          &dev_info, &recv_stream);
   if (!s.ok()) {
@@ -432,7 +428,7 @@ void GPUUtil::CopyGPUTensorToSameGPU(Device* gpu_device,
                                      StatusCallback done) {
   VLOG(1) << "CopyGPUTensorToSameGPU";
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(gpu_device, device_context, *src_gpu_tensor,
                          dst_gpu_tensor, &dev_info, &send_stream);
   if (!s.ok()) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
index 337dc89895..0c69a17eaa 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -74,10 +74,9 @@ class GPUUtil {
   // NOTE: will be removed soon, see StreamExecutorUtil::AsDeviceMemory
   // instead.
   template <typename T>
-  static perftools::gputools::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
+  static se::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
     T* ptr = reinterpret_cast<T*>(const_cast<void*>(DMAHelper::base(&t)));
-    return perftools::gputools::DeviceMemory<T>(
-        perftools::gputools::DeviceMemoryBase(ptr, t.TotalBytes()));
+    return se::DeviceMemory<T>(se::DeviceMemoryBase(ptr, t.TotalBytes()));
   }
 
   // Computes a checksum over the contents of "tensor", which is allocated
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h
index 91ce830df8..310158aba1 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@@ -181,7 +181,7 @@ class BasicCPUAllocator : public SubAllocator {
 class CUDAHostAllocator : public SubAllocator {
  public:
   // Note: stream_exec cannot be null.
-  explicit CUDAHostAllocator(perftools::gputools::StreamExecutor* stream_exec)
+  explicit CUDAHostAllocator(se::StreamExecutor* stream_exec)
       : stream_exec_(stream_exec) {
     CHECK(stream_exec_ != nullptr);
   }
@@ -206,7 +206,7 @@ class CUDAHostAllocator : public SubAllocator {
   }
 
  private:
-  perftools::gputools::StreamExecutor* stream_exec_;  // not owned, non-null
+  se::StreamExecutor* stream_exec_;  // not owned, non-null
 
   TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
index 85555955e3..a4c8d5fe86 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -20,18 +20,16 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 namespace {
 
 TEST(PoolAllocatorTest, ZeroSizeBuffers) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
 
@@ -44,12 +42,12 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) {
 }
 
 TEST(PoolAllocatorTest, ZeroSizePool) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
 
@@ -77,12 +75,12 @@ TEST(PoolAllocatorTest, ZeroSizePool) {
 }
 
 TEST(PoolAllocatorTest, Alignment) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
   for (int i = 0; i < 16; ++i) {
@@ -123,12 +121,12 @@ TEST(PoolAllocatorTest, AutoResize) {
 }
 
 TEST(PoolAllocatorTest, CudaHostAllocator) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
 
@@ -200,12 +198,12 @@ TEST(PoolAllocatorTest, Pow2Rounder) {
 }
 
 TEST(PoolAllocatorTest, Name) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
   EXPECT_EQ("pool", pool.Name());
diff --git a/tensorflow/core/common_runtime/gpu_device_context.h b/tensorflow/core/common_runtime/gpu_device_context.h
index 38a18cd087..a1ad2c2277 100644
--- a/tensorflow/core/common_runtime/gpu_device_context.h
+++ b/tensorflow/core/common_runtime/gpu_device_context.h
@@ -63,8 +63,8 @@ class GPUDeviceContext : public DeviceContext {
                              Device* device, Tensor* cpu_tensor,
                              StatusCallback done) override;
 
-  void MaintainLifetimeOnStream(
-      const Tensor* t, perftools::gputools::Stream* stream) const override {}
+  void MaintainLifetimeOnStream(const Tensor* t,
+                                se::Stream* stream) const override {}
 
  private:
   int stream_id_;
diff --git a/tensorflow/core/grappler/devices.cc b/tensorflow/core/grappler/devices.cc
index 2be894a08b..3268697671 100644
--- a/tensorflow/core/grappler/devices.cc
+++ b/tensorflow/core/grappler/devices.cc
@@ -31,15 +31,14 @@ int GetNumAvailableGPUs() {
   int num_eligible_gpus = 0;
 #if GOOGLE_CUDA
   if (ValidateGPUMachineManager().ok()) {
-    perftools::gputools::Platform* gpu_manager = GPUMachineManager();
+    se::Platform* gpu_manager = GPUMachineManager();
     if (gpu_manager != nullptr) {
       int num_gpus = gpu_manager->VisibleDeviceCount();
       for (int i = 0; i < num_gpus; i++) {
         auto exec_status = gpu_manager->ExecutorForDevice(i);
         if (exec_status.ok()) {
-          perftools::gputools::StreamExecutor* se = exec_status.ValueOrDie();
-          const perftools::gputools::DeviceDescription& desc =
-              se->GetDeviceDescription();
+          se::StreamExecutor* se = exec_status.ValueOrDie();
+          const se::DeviceDescription& desc = se->GetDeviceDescription();
           int min_gpu_core_count = 8;
           if (desc.core_count() >= min_gpu_core_count) {
             num_eligible_gpus++;
@@ -57,10 +56,9 @@ int GetNumAvailableGPUs() {
 int64 AvailableGPUMemory(int gpu_id) {
 #if GOOGLE_CUDA
   // Look up the device, to see its attributes.
-  perftools::gputools::Platform* gpu_platform = GPUMachineManager();
+  se::Platform* gpu_platform = GPUMachineManager();
   CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount());
-  perftools::gputools::StreamExecutor* se =
-      gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+  se::StreamExecutor* se = gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
   int64 total_memory, available_memory;
   CHECK(se->DeviceMemoryUsage(&available_memory, &total_memory));
 
diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc
index c581d1451f..ba38e1a188 100644
--- a/tensorflow/core/kernels/avgpooling_op.cc
+++ b/tensorflow/core/kernels/avgpooling_op.cc
@@ -156,10 +156,10 @@ class AvgPoolingOp<GPUDevice, T> : public UnaryOp<T> {
     TensorShape output_shape = params.forward_output_shape();
 
     if (data_format_ == FORMAT_NCHW) {
-      DnnPoolingOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
-          stride_, padding_, data_format_, tensor_in, output_shape,
-          /*propagate_nans=*/false);
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, ksize_,
+                               stride_, padding_, data_format_, tensor_in,
+                               output_shape,
+                               /*propagate_nans=*/false);
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context,
@@ -417,10 +417,10 @@ class AvgPoolingGradOp<GPUDevice, T> : public OpKernel {
       output_shape.AddDim(shape_vec(i));
     }
 
-    DnnPoolingGradOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
-        stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
-        output_shape, /*propagate_nans=*/false);
+    DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kAverage,
+                                 ksize_, stride_, padding_, data_format_,
+                                 nullptr, nullptr, out_backprop, output_shape,
+                                 /*propagate_nans=*/false);
   }
 
  private:
@@ -547,10 +547,10 @@ class AvgPoolingGradOpCustomGPUKernel : public OpKernel {
                                 output->flat<T>().data(),       // bottom_diff
                                 context->eigen_gpu_device());   // d
     } else {
-      DnnPoolingGradOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
-          stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
-          output_shape, /*propagate_nans=*/false);
+      DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kAverage,
+                                   ksize_, stride_, padding_, data_format_,
+                                   nullptr, nullptr, out_backprop, output_shape,
+                                   /*propagate_nans=*/false);
     }
   }
 
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 43e716c542..a1c03f9918 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -245,35 +245,35 @@ struct LaunchBatchMatMul<CPUDevice, Scalar> {
 
 namespace {
 template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
-class CublasScratchAllocator : public perftools::gputools::ScratchAllocator {
+class CublasScratchAllocator : public se::ScratchAllocator {
  public:
-  using Stream = ::perftools::gputools::Stream;
-  using DeviceMemoryBytes = ::perftools::gputools::DeviceMemory<uint8>;
+  using Stream = se::Stream;
+  using DeviceMemoryBytes = se::DeviceMemory<uint8>;
 
   CublasScratchAllocator(OpKernelContext* context) : context_(context) {}
 
   int64 GetMemoryLimitInBytes(Stream* stream) override { return -1; }
 
-  perftools::gputools::port::StatusOr<DeviceMemoryBytes> AllocateBytes(
+  se::port::StatusOr<DeviceMemoryBytes> AllocateBytes(
       Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
 
     Status allocation_status(context_->allocate_temp(
         DT_UINT8, TensorShape({byte_size}), &temporary_memory));
     if (!allocation_status.ok()) {
-      return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
+      return se::port::StatusOr<DeviceMemoryBytes>(
           DeviceMemoryBytes::MakeFromByteSize(nullptr, 0));
     }
     // Hold the reference of the allocated tensors until the end of the
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
-    return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
+    return se::port::StatusOr<DeviceMemoryBytes>(
         DeviceMemoryBytes::MakeFromByteSize(
             temporary_memory.flat<uint8>().data(),
             temporary_memory.flat<uint8>().size()));
@@ -289,12 +289,11 @@ template <typename Scalar>
 struct LaunchBatchMatMul<GPUDevice, Scalar> {
   static void Launch(OpKernelContext* context, const Tensor& in_x,
                      const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-    constexpr perftools::gputools::blas::Transpose kTranspose =
-        is_complex<Scalar>::value
-            ? perftools::gputools::blas::Transpose::kConjugateTranspose
-            : perftools::gputools::blas::Transpose::kTranspose;
-    perftools::gputools::blas::Transpose trans[] = {
-        perftools::gputools::blas::Transpose::kNoTranspose, kTranspose};
+    constexpr se::blas::Transpose kTranspose =
+        is_complex<Scalar>::value ? se::blas::Transpose::kConjugateTranspose
+                                  : se::blas::Transpose::kTranspose;
+    se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose,
+                                   kTranspose};
     const uint64 m = in_x.dim_size(adj_x ? 2 : 1);
     const uint64 k = in_x.dim_size(adj_x ? 1 : 2);
     const uint64 n = in_y.dim_size(adj_y ? 1 : 2);
@@ -305,7 +304,7 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
     auto* stream = context->op_device_context()->stream();
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
 
-    typedef perftools::gputools::DeviceMemory<Scalar> DeviceMemoryType;
+    typedef se::DeviceMemory<Scalar> DeviceMemoryType;
     std::vector<DeviceMemoryType> a_device_memory;
     std::vector<DeviceMemoryType> b_device_memory;
     std::vector<DeviceMemoryType> c_device_memory;
@@ -340,19 +339,16 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
       // This is a regular matrix*matrix or matrix*vector multiply. Avoid the
       // overhead of the scratch allocator and the batch interface.
       if (n == 1 &&
-          blas_transpose_b !=
-              perftools::gputools::blas::Transpose::kConjugateTranspose &&
-          blas_transpose_a !=
-              perftools::gputools::blas::Transpose::kConjugateTranspose) {
+          blas_transpose_b != se::blas::Transpose::kConjugateTranspose &&
+          blas_transpose_a != se::blas::Transpose::kConjugateTranspose) {
         // This is a matrix*vector multiply so use GEMV to compute A * b.
         // Here we are multiplying in the natural order, so we have to flip
         // the transposition flag to compensate for the tensor being stored
         // row-major. Since GEMV doesn't provide a way to just conjugate an
         // argument, we have to defer those cases to GEMM below.
-        auto gemv_trans_a =
-            blas_transpose_a == perftools::gputools::blas::Transpose::kTranspose
-                ? perftools::gputools::blas::Transpose::kNoTranspose
-                : perftools::gputools::blas::Transpose::kTranspose;
+        auto gemv_trans_a = blas_transpose_a == se::blas::Transpose::kTranspose
+                                ? se::blas::Transpose::kNoTranspose
+                                : se::blas::Transpose::kTranspose;
         bool blas_launch_status =
             stream
                 ->ThenBlasGemv(gemv_trans_a, adj_x ? m : k, adj_x ? k : m,
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
index 368993c827..9fda7169a8 100644
--- a/tensorflow/core/kernels/bias_op.cc
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -393,8 +393,8 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
     if (channel == 0) return;
     auto* stream = context->op_device_context()->stream();
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
-    perftools::gputools::DeviceMemoryBase output_ptr(
-        output->flat<T>().data(), output->NumElements() * sizeof(T));
+    se::DeviceMemoryBase output_ptr(output->flat<T>().data(),
+                                    output->NumElements() * sizeof(T));
     stream->ThenMemZero(&output_ptr, output->NumElements() * sizeof(T));
     if (output_backprop.NumElements() > 0) {
       BiasGradGPU<T>::compute(context->template eigen_device<Device>(),
diff --git a/tensorflow/core/kernels/check_numerics_op.cc b/tensorflow/core/kernels/check_numerics_op.cc
index d3b67f4614..c3c0c50007 100644
--- a/tensorflow/core/kernels/check_numerics_op.cc
+++ b/tensorflow/core/kernels/check_numerics_op.cc
@@ -139,7 +139,7 @@ class CheckNumericsOp<GPUDevice, T> : public AsyncOpKernel {
     OP_REQUIRES_ASYNC(context, stream != nullptr,
                       errors::Internal("No GPU stream available."), done);
 
-    perftools::gputools::DeviceMemoryBase abnormal_detected_ptr(
+    se::DeviceMemoryBase abnormal_detected_ptr(
         abnormal_detected.flat<int>().data(),
         abnormal_detected.flat<int>().size());
     stream->ThenMemset32(&abnormal_detected_ptr, 0,
@@ -174,8 +174,8 @@ class CheckNumericsOp<GPUDevice, T> : public AsyncOpKernel {
     TensorReference abnormal_detected_ref(abnormal_detected);
     auto check_cb = [this, stream, abnormal_detected_ref,
                      abnormal_detected_host, context, done]() {
-      ::perftools::gputools::cuda::ScopedActivateExecutorContext
-          scoped_activation{stream->parent()};
+      se::cuda::ScopedActivateExecutorContext scoped_activation{
+          stream->parent()};
       auto abnormal_detected_host_flat = abnormal_detected_host.flat<int>();
       int is_nan = abnormal_detected_host_flat(0);
       int is_inf = abnormal_detected_host_flat(1);
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index f3b91494b9..ef1e73e5ab 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -532,7 +532,7 @@ struct ConvBackwardFilterAutoTuneGroup {
   static string name() { return "ConvBwdFilter"; }
 };
 typedef AutoTuneSingleton<ConvBackwardFilterAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConvBwdFilter;
 
 // Backprop for filter.
@@ -636,9 +636,9 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     const Tensor& out_backprop, const Tensor& input, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
     Tensor* filter_backprop, TensorFormat data_format) {
-  using perftools::gputools::dnn::AlgorithmConfig;
-  using perftools::gputools::dnn::AlgorithmDesc;
-  using perftools::gputools::dnn::ProfileResult;
+  using se::dnn::AlgorithmConfig;
+  using se::dnn::AlgorithmDesc;
+  using se::dnn::ProfileResult;
 
   std::vector<int32> dilations(4, 1);
   dilations[GetTensorDimIndex(data_format, 'H')] = row_dilation;
@@ -721,9 +721,9 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
 
     bool blas_launch_status =
         stream
-            ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                           perftools::gputools::blas::Transpose::kTranspose, n,
-                           m, k, 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
+            ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                           se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                           a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
             .ok();
     if (!blas_launch_status) {
       ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -751,9 +751,9 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
 
     bool blas_launch_status =
         stream
-            ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                           perftools::gputools::blas::Transpose::kTranspose, n,
-                           m, k, 1.0f, b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
+            ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                           se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                           b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
             .ok();
     if (!blas_launch_status) {
       ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -787,24 +787,24 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
   CHECK(padding_rows >= 0 && padding_cols >= 0)
       << "Negative row or col paddings: (" << padding_rows << ", "
       << padding_cols << ")";
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input, data_format, 'H'))
       .set_width(GetTensorDim(compatible_input, data_format, 'W'))
       .set_feature_map_count(dims.in_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(dims.batch_size)
       .set_height(dims.spatial_dims[0].output_size)
       .set_width(dims.spatial_dims[1].output_size)
       .set_feature_map_count(dims.out_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::FilterDescriptor filter_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::FilterDescriptor filter_desc;
   filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size)
       .set_input_filter_width(dims.spatial_dims[1].filter_size)
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
-  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  se::dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 66d15c6e78..35f2676023 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -604,7 +604,7 @@ struct ConvBackwardDataAutoTuneGroup {
   static string name() { return "ConvBwdData"; }
 };
 typedef AutoTuneSingleton<ConvBackwardDataAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConvBwdData;
 
 // Backprop for input.
@@ -705,9 +705,9 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     const Tensor& out_backprop, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
     Tensor* in_backprop, TensorFormat data_format) {
-  using perftools::gputools::dnn::AlgorithmConfig;
-  using perftools::gputools::dnn::AlgorithmDesc;
-  using perftools::gputools::dnn::ProfileResult;
+  using se::dnn::AlgorithmConfig;
+  using se::dnn::AlgorithmDesc;
+  using se::dnn::ProfileResult;
 
   std::vector<int32> strides(4, 1);
   std::vector<int32> dilations(4, 1);
@@ -778,8 +778,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                 in_backprop->template flat<T>().size());
 
-    auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto transpose = se::blas::Transpose::kTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
 
     bool blas_launch_status =
         stream
@@ -810,8 +810,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                 in_backprop->template flat<T>().size());
 
-    auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto transpose = se::blas::Transpose::kTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
 
     bool blas_launch_status =
         stream
@@ -841,24 +841,24 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
   CHECK(padding_rows >= 0 && padding_cols >= 0)
       << "Negative row or col paddings: (" << padding_rows << ", "
       << padding_cols << ")";
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input_shape, data_format, 'H'))
       .set_width(GetTensorDim(compatible_input_shape, data_format, 'W'))
       .set_feature_map_count(dims.in_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(dims.batch_size)
       .set_height(dims.spatial_dims[0].output_size)
       .set_width(dims.spatial_dims[1].output_size)
       .set_feature_map_count(dims.out_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::FilterDescriptor filter_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::FilterDescriptor filter_desc;
   filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size)
       .set_input_filter_width(dims.spatial_dims[1].filter_size)
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
-  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  se::dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 092e859a5b..9edc6d416e 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -35,7 +35,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
-using perftools::gputools::dnn::DimIndex;
+using stream_executor::dnn::DimIndex;
 #endif
 
 namespace tensorflow {
@@ -468,7 +468,7 @@ struct Conv3dBackwardDataAutoTuneGroup {
   static string name() { return "Conv3dBwdData"; }
 };
 typedef AutoTuneSingleton<Conv3dBackwardDataAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
 
     AutoTuneConv3dBwdData;
 template <typename T>
@@ -554,8 +554,8 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
       auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                   in_backprop->template flat<T>().size());
 
-      auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto transpose = se::blas::Transpose::kTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
 
       bool blas_launch_status =
           stream
@@ -582,8 +582,8 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
       auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                   in_backprop->template flat<T>().size());
 
-      auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto transpose = se::blas::Transpose::kTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
 
       bool blas_launch_status =
           stream
@@ -629,27 +629,27 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
     CHECK(padding_rows >= 0 && padding_cols >= 0 && padding_planes >= 0)
         << "Negative paddings: (" << padding_rows << ", " << padding_cols
         << ", " << padding_planes << ")";
-    perftools::gputools::dnn::BatchDescriptor input_desc(3);
+    se::dnn::BatchDescriptor input_desc(3);
     input_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X, compatible_input_shape.dim_size(4))
         .set_spatial_dim(DimIndex::Y, compatible_input_shape.dim_size(3))
         .set_spatial_dim(DimIndex::Z, compatible_input_shape.dim_size(2))
         .set_feature_map_count(in_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::BatchDescriptor output_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::BatchDescriptor output_desc(3);
     output_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X, output_cols)
         .set_spatial_dim(DimIndex::Y, output_rows)
         .set_spatial_dim(DimIndex::Z, output_planes)
         .set_feature_map_count(out_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::FilterDescriptor filter_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::FilterDescriptor filter_desc(3);
     filter_desc.set_spatial_dim(DimIndex::X, filter_size[2])
         .set_spatial_dim(DimIndex::Y, filter_size[1])
         .set_spatial_dim(DimIndex::Z, filter_size[0])
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
-    perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
+    se::dnn::ConvolutionDescriptor conv_desc(3);
     conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
         .set_dilation_rate(DimIndex::Y, dilations[1])
         .set_dilation_rate(DimIndex::Z, dilations[0])
@@ -725,9 +725,9 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         device_id,
     };
 
-    using perftools::gputools::dnn::AlgorithmConfig;
-    using perftools::gputools::dnn::AlgorithmDesc;
-    using perftools::gputools::dnn::ProfileResult;
+    using se::dnn::AlgorithmConfig;
+    using se::dnn::AlgorithmDesc;
+    using se::dnn::ProfileResult;
     AlgorithmConfig algorithm_config;
     if (cudnn_use_autotune_ && !AutoTuneConv3dBwdData::GetInstance()->Find(
                                    conv_parameters, &algorithm_config)) {
@@ -839,7 +839,7 @@ struct Conv3dBackwardFilterAutoTuneGroup {
   static string name() { return "Conv3dBwdFilter"; }
 };
 typedef AutoTuneSingleton<Conv3dBackwardFilterAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConv3dBwdFilter;
 
 template <typename T>
@@ -941,9 +941,9 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
 
       bool blas_launch_status =
           stream
-              ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                             perftools::gputools::blas::Transpose::kTranspose,
-                             n, m, k, 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
+              ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                             se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                             a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
               .ok();
       if (!blas_launch_status) {
         context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -967,9 +967,9 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
 
       bool blas_launch_status =
           stream
-              ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                             perftools::gputools::blas::Transpose::kTranspose,
-                             n, m, k, 1.0f, b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
+              ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                             se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                             b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
               .ok();
       if (!blas_launch_status) {
         context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -1014,7 +1014,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
     CHECK(padding_rows >= 0 && padding_cols >= 0 && padding_planes >= 0)
         << "Negative paddings: (" << padding_rows << ", " << padding_cols
         << ", " << padding_planes << ")";
-    perftools::gputools::dnn::BatchDescriptor input_desc(3);
+    se::dnn::BatchDescriptor input_desc(3);
     input_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X,
                          GetTensorDim(compatible_input, data_format_, '2'))
@@ -1023,21 +1023,21 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         .set_spatial_dim(DimIndex::Z,
                          GetTensorDim(compatible_input, data_format_, '0'))
         .set_feature_map_count(in_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::BatchDescriptor output_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::BatchDescriptor output_desc(3);
     output_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X, output_cols)
         .set_spatial_dim(DimIndex::Y, output_rows)
         .set_spatial_dim(DimIndex::Z, output_planes)
         .set_feature_map_count(out_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::FilterDescriptor filter_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::FilterDescriptor filter_desc(3);
     filter_desc.set_spatial_dim(DimIndex::X, filter_size[2])
         .set_spatial_dim(DimIndex::Y, filter_size[1])
         .set_spatial_dim(DimIndex::Z, filter_size[0])
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
-    perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
+    se::dnn::ConvolutionDescriptor conv_desc(3);
     conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
         .set_dilation_rate(DimIndex::Y, dilations[1])
         .set_dilation_rate(DimIndex::Z, dilations[0])
@@ -1121,9 +1121,9 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         device_id,
     };
 
-    using perftools::gputools::dnn::AlgorithmConfig;
-    using perftools::gputools::dnn::AlgorithmDesc;
-    using perftools::gputools::dnn::ProfileResult;
+    using se::dnn::AlgorithmConfig;
+    using se::dnn::AlgorithmDesc;
+    using se::dnn::ProfileResult;
     AlgorithmConfig algorithm_config;
     if (cudnn_use_autotune_ && !AutoTuneConv3dBwdFilter::GetInstance()->Find(
                                    conv_parameters, &algorithm_config)) {
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index f0888c655f..c6d36b40fe 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -475,7 +475,7 @@ struct ConvAutoTuneGroup {
   static string name() { return "Conv"; }
 };
 typedef AutoTuneSingleton<ConvAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConv;
 
 template <typename T>
@@ -484,9 +484,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     const Tensor& input_param, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
     Tensor* output, TensorFormat data_format) {
-  using perftools::gputools::dnn::AlgorithmConfig;
-  using perftools::gputools::dnn::AlgorithmDesc;
-  using perftools::gputools::dnn::ProfileResult;
+  using se::dnn::AlgorithmConfig;
+  using se::dnn::AlgorithmDesc;
+  using se::dnn::ProfileResult;
   auto* stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
 
@@ -514,7 +514,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                 output->template flat<T>().size());
 
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
     bool blas_launch_status =
         stream
             ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
@@ -543,7 +543,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                 output->template flat<T>().size());
 
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
     bool blas_launch_status =
         stream
             ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
@@ -629,24 +629,24 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   CHECK(padding_rows >= 0 && padding_cols >= 0)
       << "Negative row or col paddings: (" << padding_rows << ", "
       << padding_cols << ")";
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(in_batch)
       .set_feature_map_count(in_depths)
       .set_height(in_rows)
       .set_width(in_cols)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(out_batch)
       .set_height(out_rows)
       .set_width(out_cols)
       .set_feature_map_count(out_depths)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::FilterDescriptor filter_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::FilterDescriptor filter_desc;
   filter_desc.set_input_filter_height(filter.dim_size(0))
       .set_input_filter_width(filter.dim_size(1))
       .set_input_feature_map_count(filter.dim_size(2))
       .set_output_feature_map_count(filter.dim_size(3));
-  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  se::dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_dilation_rate(row_dilation)
       .set_horizontal_dilation_rate(col_dilation)
       .set_vertical_filter_stride(row_stride)
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 48dd3c9eb0..9ec16be67d 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -34,7 +34,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
-using perftools::gputools::dnn::DimIndex;
+using stream_executor::dnn::DimIndex;
 #endif
 
 namespace tensorflow {
@@ -192,7 +192,7 @@ struct Conv3dAutoTuneGroup {
   static string name() { return "Conv3d"; }
 };
 typedef AutoTuneSingleton<Conv3dAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConv3d;
 
 // TODO(mjanusz): Share logic with 2d implementation as much as possible.
@@ -250,7 +250,7 @@ struct LaunchConvOp<GPUDevice, T> {
       auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                   output->template flat<T>().size());
 
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
       bool blas_launch_status =
           stream
               ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr,
@@ -277,7 +277,7 @@ struct LaunchConvOp<GPUDevice, T> {
       auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                   output->template flat<T>().size());
 
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
       bool blas_launch_status =
           stream
               ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr,
@@ -346,27 +346,27 @@ struct LaunchConvOp<GPUDevice, T> {
     CHECK(pad_rows >= 0 && pad_cols >= 0 && pad_planes >= 0)
         << "Negative paddings: (" << pad_rows << ", " << pad_cols << ", "
         << pad_planes << ")";
-    perftools::gputools::dnn::BatchDescriptor input_desc(3);
+    se::dnn::BatchDescriptor input_desc(3);
     input_desc.set_count(in_batch)
         .set_feature_map_count(in_depth)
         .set_spatial_dim(DimIndex::X, in_cols)
         .set_spatial_dim(DimIndex::Y, in_rows)
         .set_spatial_dim(DimIndex::Z, in_planes)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::BatchDescriptor output_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::BatchDescriptor output_desc(3);
     output_desc.set_count(in_batch)
         .set_spatial_dim(DimIndex::X, out_cols)
         .set_spatial_dim(DimIndex::Y, out_rows)
         .set_spatial_dim(DimIndex::Z, out_planes)
         .set_feature_map_count(out_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::FilterDescriptor filter_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::FilterDescriptor filter_desc(3);
     filter_desc.set_spatial_dim(DimIndex::X, filter_cols)
         .set_spatial_dim(DimIndex::Y, filter_rows)
         .set_spatial_dim(DimIndex::Z, filter_planes)
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
-    perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
+    se::dnn::ConvolutionDescriptor conv_desc(3);
     conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
         .set_dilation_rate(DimIndex::Y, dilations[1])
         .set_dilation_rate(DimIndex::Z, dilations[0])
@@ -424,9 +424,9 @@ struct LaunchConvOp<GPUDevice, T> {
         device_id,
     };
 
-    using perftools::gputools::dnn::AlgorithmConfig;
-    using perftools::gputools::dnn::AlgorithmDesc;
-    using perftools::gputools::dnn::ProfileResult;
+    using se::dnn::AlgorithmConfig;
+    using se::dnn::AlgorithmDesc;
+    using se::dnn::ProfileResult;
 
     AlgorithmConfig algorithm_config;
 
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 7f9cfec981..4215c4541c 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -36,25 +36,23 @@ int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
 // A class to provide scratch-space allocator for Stream-Executor Cudnn
 // callback. TensorFlow is responsible for releasing the temporary buffers after
 // the kernel finishes.
-class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
+class CudnnScratchAllocator : public se::ScratchAllocator {
  public:
   virtual ~CudnnScratchAllocator() {}
   CudnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
     return memory_limit_;
   }
-  perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
-  AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override {
+  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
     if (byte_size < 0) {
-      return perftools::gputools::port::Status{
-          perftools::gputools::port::error::INVALID_ARGUMENT,
-          "Requested negative byte size!"};
+      return se::port::Status{se::port::error::INVALID_ARGUMENT,
+                              "Requested negative byte size!"};
     }
     if (byte_size > memory_limit_) {
-      return perftools::gputools::port::StatusOr<
-          perftools::gputools::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     AllocationAttributes allocation_attr;
     allocation_attr.no_retry_on_failure = true;
@@ -62,15 +60,13 @@ class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
         DT_UINT8, TensorShape({byte_size}), &temporary_memory,
         AllocatorAttributes(), allocation_attr));
     if (!allocation_status.ok()) {
-      return perftools::gputools::port::StatusOr<
-          perftools::gputools::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     // Hold the reference of the allocated tensors until the end of the
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return perftools::gputools::port::StatusOr<
-        perftools::gputools::DeviceMemory<uint8>>(
+    return se::port::StatusOr<se::DeviceMemory<uint8>>(
         AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                        temporary_memory.flat<uint8>().size()));
   }
@@ -141,9 +137,9 @@ class ConvParameters {
   // for certain input parameters so as to avoid a bug in cuDNNv5 and cuDNNv6.
   template <typename T>
   bool ShouldIncludeWinogradNonfusedAlgo(
-      perftools::gputools::StreamExecutor* stream_exec) const {
+      se::StreamExecutor* stream_exec) const {
     // Skip this check for cuDNN 7 and newer.
-    perftools::gputools::port::StatusOr<std::tuple<int, int, int>> version =
+    se::port::StatusOr<std::tuple<int, int, int>> version =
         stream_exec->AsDnn()->GetVersion();
     if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
       return true;
diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index 45cc2fbbb8..54ef9c6fb4 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -39,17 +39,16 @@ limitations under the License.
 #include "tensorflow/core/platform/cuda.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using stream_executor::cuda::ScopedActivateExecutorContext;
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
+namespace {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 using Callback = std::function<void()>;
 
-namespace {
-
 static inline Status ParseAndCheckBoxSizes(const Tensor& boxes,
                                            const Tensor& box_index,
                                            int* num_boxes) {
@@ -753,8 +752,7 @@ inline void RunIfBoxIndexIsValid<GPUDevice>(
       context->allocate_temp(DataTypeToEnum<bool>::value, TensorShape({}),
                              &isvalid_host_tensor, alloc_attr),
       done);
-  perftools::gputools::DeviceMemoryBase wrapped(isvalid_dev.data(),
-                                                sizeof(bool));
+  se::DeviceMemoryBase wrapped(isvalid_dev.data(), sizeof(bool));
   const bool status =
       stream
           ->ThenMemcpy(
diff --git a/tensorflow/core/kernels/cuda_device_array.h b/tensorflow/core/kernels/cuda_device_array.h
index e7a5db0683..74dc298c7a 100644
--- a/tensorflow/core/kernels/cuda_device_array.h
+++ b/tensorflow/core/kernels/cuda_device_array.h
@@ -80,7 +80,7 @@ class CudaDeviceArrayOnHost {
     TensorReference tensor_ref(out_of_line_values_on_host_);
     TF_RETURN_IF_ERROR(context_->allocate_temp(
         DT_INT8, TensorShape{total_bytes_}, &out_of_line_values_on_gpu_));
-    perftools::gputools::DeviceMemoryBase output_values_base{
+    se::DeviceMemoryBase output_values_base{
         out_of_line_values_on_gpu_.flat<int8>().data(),
         static_cast<uint64>(total_bytes_)};
     stream->ThenMemcpy(&output_values_base,
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index 6cec032f94..a857bd3ce4 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -35,8 +35,6 @@
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
-
 // The CUDA cublas_api.h API contains const-correctness errors. Instead of
 // casting away constness on our data, we instead reinterpret the CuBLAS
 // functions as what they were clearly meant to be, and thus we can call
@@ -80,10 +78,12 @@ using matinv_Z = cublasStatus_t(cublasContext*, int, const double2* const*, int,
 namespace tensorflow {
 namespace {
 
+using se::cuda::ScopedActivateExecutorContext;
+
 inline bool CopyHostToDevice(OpKernelContext* context, void* dst,
                              const void* src, uint64 bytes) {
   auto stream = context->op_device_context()->stream();
-  perftools::gputools::DeviceMemoryBase wrapped_dst(dst);
+  se::DeviceMemoryBase wrapped_dst(dst);
   return stream->ThenMemcpy(&wrapped_dst, src, bytes).ok();
 }
 
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index ecfa23750c..b2e8ee23a9 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -398,7 +398,7 @@ class DeviceLapackInfo : public ScratchSpace<int> {
     CHECK(success != nullptr);
     HostLapackInfo copy(context(), size(), debug_info());
     auto stream = context()->op_device_context()->stream();
-    perftools::gputools::DeviceMemoryBase wrapped_src(
+    se::DeviceMemoryBase wrapped_src(
         static_cast<void*>(const_cast<int*>(this->data())));
     *success =
         stream->ThenMemcpy(copy.mutable_data(), wrapped_src, this->bytes())
diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.cc b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
index 5939ecdf62..d2b9c9edaa 100644
--- a/tensorflow/core/kernels/cudnn_pooling_gpu.cc
+++ b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
@@ -31,12 +31,13 @@ namespace tensorflow {
 #if GOOGLE_CUDA
 
 template <typename T>
-void DnnPooling3dOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
-    const std::array<int64, 3>& window, const std::array<int64, 3>& stride,
-    const std::array<int64, 3>& padding, TensorFormat data_format,
-    const Tensor& tensor_in, Tensor* output) {
+void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
+                                se::dnn::PoolingMode pooling_mode,
+                                const std::array<int64, 3>& window,
+                                const std::array<int64, 3>& stride,
+                                const std::array<int64, 3>& padding,
+                                TensorFormat data_format,
+                                const Tensor& tensor_in, Tensor* output) {
   const auto in_shape = tensor_in.shape();
   const auto out_shape = output->shape();
 
@@ -67,18 +68,18 @@ void DnnPooling3dOp<T>::Compute(
     transformed_output = *output;
   }
 
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc(3);
+  se::dnn::PoolingDescriptor pooling_desc(3);
   pooling_desc.set_pooling_mode(pooling_mode);
-  perftools::gputools::dnn::BatchDescriptor input_desc(3);
+  se::dnn::BatchDescriptor input_desc(3);
   input_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc(3);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc(3);
   output_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
   for (size_t i = 0; i < window.size(); ++i) {
-    const auto dim_i = static_cast<perftools::gputools::dnn::DimIndex>(i);
+    const auto dim_i = static_cast<se::dnn::DimIndex>(i);
     pooling_desc.set_window(dim_i, window[i]);
     pooling_desc.set_stride(dim_i, stride[i]);
     pooling_desc.set_padding(dim_i, padding[i]);
@@ -115,14 +116,13 @@ void DnnPooling3dOp<T>::Compute(
 
 template <typename T>
 void DnnPooling3dGradOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
+    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
     const std::array<int64, 3>& window, const std::array<int64, 3>& stride,
     const std::array<int64, 3>& padding,
     const std::array<int64, 3>& output_size, TensorFormat data_format,
     const Tensor& out_backprop, const TensorShape& tensor_in_shape,
     const Tensor* tensor_in, const Tensor* tensor_out, Tensor* input_backprop) {
-  CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
+  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
         (tensor_in && tensor_out))
       << "For MaxPoolGrad, both tensor_in and tensor_out needs to be "
          "specified";
@@ -186,21 +186,21 @@ void DnnPooling3dGradOp<T>::Compute(
         transformed_output_backprop.tensor<T, 5>());
   }
 
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc(3);
+  se::dnn::PoolingDescriptor pooling_desc(3);
   pooling_desc.set_pooling_mode(pooling_mode);
 
-  perftools::gputools::dnn::BatchDescriptor orig_output_desc(3);
+  se::dnn::BatchDescriptor orig_output_desc(3);
   orig_output_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-  perftools::gputools::dnn::BatchDescriptor orig_input_desc(3);
+  se::dnn::BatchDescriptor orig_input_desc(3);
   orig_input_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
   for (size_t i = 0; i < window.size(); ++i) {
-    const auto dim_i = static_cast<perftools::gputools::dnn::DimIndex>(i);
+    const auto dim_i = static_cast<se::dnn::DimIndex>(i);
     pooling_desc.set_window(dim_i, window[i]);
     pooling_desc.set_stride(dim_i, stride[i]);
     pooling_desc.set_padding(dim_i, padding[i]);
diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.h b/tensorflow/core/kernels/cudnn_pooling_gpu.h
index ff4de75845..280d697fc2 100644
--- a/tensorflow/core/kernels/cudnn_pooling_gpu.h
+++ b/tensorflow/core/kernels/cudnn_pooling_gpu.h
@@ -38,7 +38,7 @@ template <typename T>
 class DnnPooling3dOp {
  public:
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::array<int64, 3>& size,
                       const std::array<int64, 3>& stride,
                       const std::array<int64, 3>& padding,
@@ -52,7 +52,7 @@ template <typename T>
 class DnnPooling3dGradOp {
  public:
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::array<int64, 3>& window,
                       const std::array<int64, 3>& stride,
                       const std::array<int64, 3>& padding,
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index a21f13a4dd..762c2c3666 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -78,7 +78,7 @@ using CPUDevice = Eigen::ThreadPoolDevice;
 #if GOOGLE_CUDA
 
 using GPUDevice = Eigen::GpuDevice;
-using ::perftools::gputools::StreamExecutor;
+using se::StreamExecutor;
 
 template <typename Device, typename T, typename Index>
 class CudnnRNNParamsSizeOp;
@@ -102,21 +102,21 @@ enum class TFRNNInputMode {
 };
 
 namespace {
-using ::perftools::gputools::DeviceMemory;
-using ::perftools::gputools::DeviceMemoryBase;
-using ::perftools::gputools::ScratchAllocator;
-using ::perftools::gputools::Stream;
-using ::perftools::gputools::dnn::AlgorithmConfig;
-using ::perftools::gputools::dnn::AlgorithmDesc;
-using ::perftools::gputools::dnn::ProfileResult;
-using ::perftools::gputools::dnn::RnnDescriptor;
-using ::perftools::gputools::dnn::RnnDirectionMode;
-using ::perftools::gputools::dnn::RnnInputMode;
-using ::perftools::gputools::dnn::RnnMode;
-using ::perftools::gputools::dnn::RnnSequenceTensorDescriptor;
-using ::perftools::gputools::dnn::RnnStateTensorDescriptor;
-using ::perftools::gputools::dnn::ToDataType;
-using ::perftools::gputools::port::StatusOr;
+using se::DeviceMemory;
+using se::DeviceMemoryBase;
+using se::ScratchAllocator;
+using se::Stream;
+using se::dnn::AlgorithmConfig;
+using se::dnn::AlgorithmDesc;
+using se::dnn::ProfileResult;
+using se::dnn::RnnDescriptor;
+using se::dnn::RnnDirectionMode;
+using se::dnn::RnnInputMode;
+using se::dnn::RnnMode;
+using se::dnn::RnnSequenceTensorDescriptor;
+using se::dnn::RnnStateTensorDescriptor;
+using se::dnn::ToDataType;
+using se::port::StatusOr;
 
 Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
   if (str == "rnn_relu") {
@@ -213,7 +213,7 @@ DeviceMemoryBase SliceDeviceMemory(const DeviceMemoryBase& device_memory,
   return DeviceMemoryBase(offset_ptr, size);
 }
 
-inline Status FromExecutorStatus(const perftools::gputools::port::Status& s) {
+inline Status FromExecutorStatus(const se::port::Status& s) {
   return s.ok() ? Status::OK()
                 : Status(static_cast<tensorflow::error::Code>(
                              static_cast<int>(s.code())),
@@ -221,17 +221,15 @@ inline Status FromExecutorStatus(const perftools::gputools::port::Status& s) {
 }
 
 template <typename T>
-inline Status FromExecutorStatus(
-    const perftools::gputools::port::StatusOr<T>& s) {
+inline Status FromExecutorStatus(const se::port::StatusOr<T>& s) {
   return FromExecutorStatus(s.status());
 }
 
-inline perftools::gputools::port::Status ToExecutorStatus(const Status& s) {
-  return s.ok() ? perftools::gputools::port::Status::OK()
-                : perftools::gputools::port::Status(
-                      static_cast<perftools::gputools::port::error::Code>(
-                          static_cast<int>(s.code())),
-                      s.error_message());
+inline se::port::Status ToExecutorStatus(const Status& s) {
+  return s.ok() ? se::port::Status::OK()
+                : se::port::Status(static_cast<se::port::error::Code>(
+                                       static_cast<int>(s.code())),
+                                   s.error_message());
 }
 
 template <typename>
@@ -503,7 +501,7 @@ Status CreateForwardAndBackwardIODescriptors(
     std::unique_ptr<RnnStateTensorDescriptor>* state_desc,
     std::unique_ptr<RnnSequenceTensorDescriptor>* output_desc) {
   StreamExecutor* executor = context->op_device_context()->stream()->parent();
-  ::perftools::gputools::dnn::DataType data_type = ToDataType<T>::value;
+  se::dnn::DataType data_type = ToDataType<T>::value;
 
   const TensorShape& input_shape = model_shapes.input_shape;
   const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape;
@@ -773,7 +771,7 @@ class CudnnRNNKernelCommon : public OpKernel {
                              ScratchAllocator* dropout_state_allocator,
                              std::unique_ptr<RnnDescriptor>* rnn_desc) {
     StreamExecutor* executor = context->op_device_context()->stream()->parent();
-    ::perftools::gputools::dnn::DataType data_type = ToDataType<T>::value;
+    se::dnn::DataType data_type = ToDataType<T>::value;
     auto rnn_desc_s = executor->createRnnDescriptor(
         model_shapes.num_layers, model_shapes.num_units,
         model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 94989089ec..0abd64030f 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -1708,8 +1708,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GpuDevice, T>::operator()(
   // Initialize the results to 0.
   int num_filter_backprop =
       args.filter_rows * args.filter_cols * args.out_depth;
-  perftools::gputools::DeviceMemoryBase filter_bp_ptr(filter_backprop,
-                                                      num_filter_backprop);
+  se::DeviceMemoryBase filter_bp_ptr(filter_backprop, num_filter_backprop);
   stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T));
 
   if (args.filter_rows == 3 && args.filter_cols == 3) {
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index 9dfeccff0e..862a97723f 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -285,8 +285,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
         c->allocate_temp(partition_count.dtype(), partition_count.shape(),
                          &cpu_tensor, alloc_attr),
         done);
-    perftools::gputools::DeviceMemoryBase wrapped(
-        partition_count.flat<int32>().data(), num_partitions_ * sizeof(int32));
+    se::DeviceMemoryBase wrapped(partition_count.flat<int32>().data(),
+                                 num_partitions_ * sizeof(int32));
     const bool status =
         stream
             ->ThenMemcpy(cpu_tensor.flat<int32>().data(), wrapped,
diff --git a/tensorflow/core/kernels/fft_ops.cc b/tensorflow/core/kernels/fft_ops.cc
index ab5af8caad..661bf5fc5f 100644
--- a/tensorflow/core/kernels/fft_ops.cc
+++ b/tensorflow/core/kernels/fft_ops.cc
@@ -277,20 +277,19 @@ REGISTER_KERNEL_BUILDER(Name("IRFFT3D").Device(DEVICE_CPU).Label(FFT_LABEL),
 #undef FFT_LABEL
 
 #if GOOGLE_CUDA
-namespace gpu = ::perftools::gputools;
 
 namespace {
 template <typename T>
-gpu::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  gpu::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  gpu::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
 template <typename T>
-gpu::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
-  gpu::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
-  gpu::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
@@ -299,19 +298,19 @@ gpu::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
 // the kernel finishes.
 // TODO(yangzihao): Refactor redundant code in subclasses of ScratchAllocator
 // into base class.
-class CufftScratchAllocator : public gpu::ScratchAllocator {
+class CufftScratchAllocator : public se::ScratchAllocator {
  public:
   ~CufftScratchAllocator() override {}
   CufftScratchAllocator(int64 memory_limit, OpKernelContext* context)
       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
-  int64 GetMemoryLimitInBytes(gpu::Stream* stream) override {
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
     return memory_limit_;
   }
-  gpu::port::StatusOr<gpu::DeviceMemory<uint8>> AllocateBytes(
-      gpu::Stream* stream, int64 byte_size) override {
+  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
     if (byte_size > memory_limit_) {
-      return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     AllocationAttributes allocation_attr;
     allocation_attr.no_retry_on_failure = true;
@@ -319,13 +318,13 @@ class CufftScratchAllocator : public gpu::ScratchAllocator {
         DT_UINT8, TensorShape({byte_size}), &temporary_memory,
         AllocatorAttributes(), allocation_attr));
     if (!allocation_status.ok()) {
-      return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     // Hold the reference of the allocated tensors until the end of the
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>(
+    return se::port::StatusOr<se::DeviceMemory<uint8>>(
         AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                        temporary_memory.flat<uint8>().size()));
   }
@@ -394,9 +393,9 @@ class FFTGPUBase : public FFTBase {
 
     constexpr bool kInPlaceFft = false;
     const auto kFftType =
-        IsReal() ? (IsForward() ? gpu::fft::Type::kR2C : gpu::fft::Type::kC2R)
-                 : (IsForward() ? gpu::fft::Type::kC2CForward
-                                : gpu::fft::Type::kC2CInverse);
+        IsReal() ? (IsForward() ? se::fft::Type::kR2C : se::fft::Type::kC2R)
+                 : (IsForward() ? se::fft::Type::kC2CForward
+                                : se::fft::Type::kC2CInverse);
 
     CufftScratchAllocator scratch_allocator(CufftScratchSize, ctx);
     auto plan =
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 9b4dca8511..f99dd643f7 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -251,7 +251,7 @@ struct FusedBatchNorm<GPUDevice, T, U> {
     Tensor x_maybe_transformed = x;
     Tensor x_transformed;
     Tensor y_transformed;
-    perftools::gputools::DeviceMemory<T> y_ptr;
+    se::DeviceMemory<T> y_ptr;
 
     if (tensor_format == FORMAT_NCHW) {
       y_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*y);
@@ -279,19 +279,19 @@ struct FusedBatchNorm<GPUDevice, T, U> {
       return;
     }
 
-    perftools::gputools::dnn::BatchDescriptor x_desc;
+    se::dnn::BatchDescriptor x_desc;
     x_desc.set_count(batch_size)
         .set_feature_map_count(channels)
         .set_height(height)
         .set_width(width)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-    perftools::gputools::dnn::BatchDescriptor scale_offset_desc;
+    se::dnn::BatchDescriptor scale_offset_desc;
     scale_offset_desc.set_count(1)
         .set_feature_map_count(channels)
         .set_height(1)
         .set_width(1)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
     auto x_ptr = StreamExecutorUtil::AsDeviceMemory<T>(x_maybe_transformed);
     auto scale_ptr = StreamExecutorUtil::AsDeviceMemory<U>(scale);
@@ -308,7 +308,7 @@ struct FusedBatchNorm<GPUDevice, T, U> {
         StreamExecutorUtil::AsDeviceMemory<U>(*saved_inv_var);
 
     GPUDevice d = context->eigen_device<GPUDevice>();
-    using perftools::gputools::DeviceMemory;
+    using se::DeviceMemory;
     Tensor inv_var;
     OP_REQUIRES_OK(
         context, context->allocate_temp(DataTypeToEnum<U>::value,
@@ -390,7 +390,7 @@ struct FusedBatchNormGrad<GPUDevice, T, U> {
 
     // Outputs
     Tensor x_backprop_transformed;
-    perftools::gputools::DeviceMemory<T> x_backprop_ptr;
+    se::DeviceMemory<T> x_backprop_ptr;
 
     if (tensor_format == FORMAT_NCHW) {
       x_backprop_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*x_backprop);
@@ -433,19 +433,19 @@ struct FusedBatchNormGrad<GPUDevice, T, U> {
       return;
     }
 
-    perftools::gputools::dnn::BatchDescriptor x_desc;
+    se::dnn::BatchDescriptor x_desc;
     x_desc.set_count(batch_size)
         .set_feature_map_count(channels)
         .set_height(height)
         .set_width(width)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-    perftools::gputools::dnn::BatchDescriptor scale_offset_desc;
+    se::dnn::BatchDescriptor scale_offset_desc;
     scale_offset_desc.set_count(1)
         .set_feature_map_count(channels)
         .set_height(1)
         .set_width(1)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
     auto y_backprop_ptr =
         StreamExecutorUtil::AsDeviceMemory<T>(y_backprop_maybe_transformed);
diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h
index ffc733e6bb..2f64619afc 100644
--- a/tensorflow/core/kernels/gpu_utils.h
+++ b/tensorflow/core/kernels/gpu_utils.h
@@ -29,11 +29,9 @@ limitations under the License.
 namespace tensorflow {
 
 template <typename T>
-inline perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
-                                                           uint64 size) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
-                                                size * sizeof(T));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+inline se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index c3a59c9576..b4252eb044 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -187,14 +187,14 @@ struct LaunchLRN<GPUDevice, T> {
     const int cols = static_cast<int>(in.dim_size(2));
     const int depth = static_cast<int>(in.dim_size(3));
 
-    perftools::gputools::dnn::BatchDescriptor dimensions_desc;
+    se::dnn::BatchDescriptor dimensions_desc;
     dimensions_desc.set_count(batch)
         .set_height(rows)
         .set_width(cols)
         .set_feature_map_count(depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+        .set_layout(se::dnn::DataLayout::kBatchYXDepth);
 
-    perftools::gputools::dnn::NormalizeDescriptor normalize_desc;
+    se::dnn::NormalizeDescriptor normalize_desc;
     normalize_desc.set_bias(bias_)
         .set_range(depth_radius_)
         .set_alpha(alpha_)
@@ -404,14 +404,14 @@ struct LaunchLRNGrad<GPUDevice, T> {
     const int64 cols = in_grads.dim_size(2);
     const int64 depth = in_grads.dim_size(3);
 
-    perftools::gputools::dnn::BatchDescriptor dimensions_desc;
+    se::dnn::BatchDescriptor dimensions_desc;
     dimensions_desc.set_count(batch)
         .set_height(rows)
         .set_width(cols)
         .set_feature_map_count(depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+        .set_layout(se::dnn::DataLayout::kBatchYXDepth);
 
-    perftools::gputools::dnn::NormalizeDescriptor normalize_desc;
+    se::dnn::NormalizeDescriptor normalize_desc;
     normalize_desc.set_bias(bias_)
         .set_range(depth_radius_)
         .set_alpha(alpha_)
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index f499ce6519..3664f95c3b 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -112,7 +112,7 @@ bool ExplicitVectorMatrixOptimization<Eigen::half>(
 template <typename Device, typename T>
 struct LaunchMatMulBase {
 #if GOOGLE_CUDA
-  typedef perftools::gputools::blas::AlgorithmType AlgorithmType;
+  typedef se::blas::AlgorithmType AlgorithmType;
 #else
   typedef int64 AlgorithmType;
 #endif  // GOOGLE_CUDA
@@ -160,15 +160,12 @@ namespace {
 
 template <typename T>
 struct LaunchBlasGemv {
-  static void Compute(
-      OpKernelContext* ctx, perftools::gputools::Stream* stream, bool trans,
-      uint64 m, uint64 n, const perftools::gputools::DeviceMemory<T>& a,
-      const perftools::gputools::DeviceMemory<T>& b,
-      perftools::gputools::DeviceMemory<T>* c,
-      perftools::gputools::blas::ProfileResult* output_profile) {
-    const auto blas_trans =
-        trans ? perftools::gputools::blas::Transpose::kTranspose
-              : perftools::gputools::blas::Transpose::kNoTranspose;
+  static void Compute(OpKernelContext* ctx, se::Stream* stream, bool trans,
+                      uint64 m, uint64 n, const se::DeviceMemory<T>& a,
+                      const se::DeviceMemory<T>& b, se::DeviceMemory<T>* c,
+                      se::blas::ProfileResult* output_profile) {
+    const auto blas_trans = trans ? se::blas::Transpose::kTranspose
+                                  : se::blas::Transpose::kNoTranspose;
     if (output_profile == nullptr) {
       bool blas_launch_status =
           stream
@@ -198,11 +195,10 @@ struct LaunchBlasGemv {
 
 template <>
 void LaunchBlasGemv<Eigen::half>::Compute(
-    OpKernelContext* ctx, perftools::gputools::Stream* stream, bool trans,
-    uint64 m, uint64 n, const perftools::gputools::DeviceMemory<Eigen::half>& a,
-    const perftools::gputools::DeviceMemory<Eigen::half>& b,
-    perftools::gputools::DeviceMemory<Eigen::half>* c,
-    perftools::gputools::blas::ProfileResult* output_profile) {
+    OpKernelContext* ctx, se::Stream* stream, bool trans, uint64 m, uint64 n,
+    const se::DeviceMemory<Eigen::half>& a,
+    const se::DeviceMemory<Eigen::half>& b, se::DeviceMemory<Eigen::half>* c,
+    se::blas::ProfileResult* output_profile) {
   ctx->SetStatus(errors::Internal(
       "Blas GEMV launch failed: GEMV is not implemented for float16."));
 }
@@ -219,10 +215,9 @@ bool ShouldUseGemv(uint64 n) {
 
 }  // namespace
 
-bool GetCublasAutotuneComputationType(
-    const DataType& dtype,
-    perftools::gputools::blas::ComputationType* compute_type) {
-  using perftools::gputools::blas::ComputationType;
+bool GetCublasAutotuneComputationType(const DataType& dtype,
+                                      se::blas::ComputationType* compute_type) {
+  using se::blas::ComputationType;
   bool use_f32_for_f16_computation = MatmulDoFP32ComputationFP16Input();
   switch (dtype) {
     case DT_HALF:
@@ -250,7 +245,7 @@ struct MatmulAutoTuneGroup {
   static string name() { return "Matmul"; }
 };
 typedef AutoTuneSingleton<MatmulAutoTuneGroup, MatmulParameters,
-                          perftools::gputools::blas::AlgorithmConfig>
+                          se::blas::AlgorithmConfig>
     AutoTuneMatmul;
 
 template <typename T>
@@ -259,14 +254,14 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
       OpKernelContext* ctx, const Tensor& a, const Tensor& b,
       const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
       std::vector<int64>* algorithms, bool use_autotune, Tensor* out) {
-    using perftools::gputools::blas::AlgorithmConfig;
-    using perftools::gputools::blas::ComputationType;
-    using perftools::gputools::blas::kDefaultAlgorithm;
-    using perftools::gputools::blas::kDefaultBlasGemm;
-    using perftools::gputools::blas::kDefaultBlasGemv;
-    using perftools::gputools::blas::kNoAlgorithm;
-    using perftools::gputools::blas::ProfileResult;
-    using perftools::gputools::blas::Transpose;
+    using se::blas::AlgorithmConfig;
+    using se::blas::ComputationType;
+    using se::blas::kDefaultAlgorithm;
+    using se::blas::kDefaultBlasGemm;
+    using se::blas::kDefaultBlasGemv;
+    using se::blas::kNoAlgorithm;
+    using se::blas::ProfileResult;
+    using se::blas::Transpose;
     Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose};
     const uint64 m = a.dim_size(1 - dim_pair[0].first);
     const uint64 k = a.dim_size(dim_pair[0].first);
diff --git a/tensorflow/core/kernels/matrix_triangular_solve_op.cc b/tensorflow/core/kernels/matrix_triangular_solve_op.cc
index 6f7e6a7496..5de0d1118a 100644
--- a/tensorflow/core/kernels/matrix_triangular_solve_op.cc
+++ b/tensorflow/core/kernels/matrix_triangular_solve_op.cc
@@ -34,11 +34,9 @@ namespace tensorflow {
 #if GOOGLE_CUDA
 namespace {
 template <typename Scalar>
-perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
-    const Scalar* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(
-      const_cast<Scalar*>(cuda_memory));
-  perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
+se::DeviceMemory<Scalar> AsDeviceMemory(const Scalar* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<Scalar*>(cuda_memory));
+  se::DeviceMemory<Scalar> typed(wrapped);
   return typed;
 }
 }  // namespace
@@ -204,18 +202,17 @@ class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
     // output' = rhs' / matrix' (' stands for transpose)
     // Upper/lower needs to be swapped for this.
 
-    perftools::gputools::blas::UpperLower upper_lower_matrix;
-    perftools::gputools::blas::Transpose transpose_matrix;
+    se::blas::UpperLower upper_lower_matrix;
+    se::blas::Transpose transpose_matrix;
     if (lower_) {
-      upper_lower_matrix = perftools::gputools::blas::UpperLower::kUpper;
+      upper_lower_matrix = se::blas::UpperLower::kUpper;
     } else {
-      upper_lower_matrix = perftools::gputools::blas::UpperLower::kLower;
+      upper_lower_matrix = se::blas::UpperLower::kLower;
     }
     if (adjoint_) {
-      transpose_matrix =
-          perftools::gputools::blas::Transpose::kConjugateTranspose;
+      transpose_matrix = se::blas::Transpose::kConjugateTranspose;
     } else {
-      transpose_matrix = perftools::gputools::blas::Transpose::kNoTranspose;
+      transpose_matrix = se::blas::Transpose::kNoTranspose;
     }
     uint64 leading_dim_matrix = matrix.cols();
     uint64 leading_dim_output = output.cols();
@@ -224,11 +221,11 @@ class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
     bool blas_launch_status =
         stream
             ->ThenBlasTrsm(
-                perftools::gputools::blas::Side::kRight /*side*/,
-                upper_lower_matrix /*uplo*/, transpose_matrix /*trans*/,
-                perftools::gputools::blas::Diagonal::kNonUnit /*diag*/,
-                colmajor_rows /*m*/, colmajor_cols /*n*/, Scalar(1.0) /*alpha*/,
-                matrix_ptr, leading_dim_matrix /*lda*/, &out_ptr,
+                se::blas::Side::kRight /*side*/, upper_lower_matrix /*uplo*/,
+                transpose_matrix /*trans*/,
+                se::blas::Diagonal::kNonUnit /*diag*/, colmajor_rows /*m*/,
+                colmajor_cols /*n*/, Scalar(1.0) /*alpha*/, matrix_ptr,
+                leading_dim_matrix /*lda*/, &out_ptr,
                 leading_dim_output /*ldb*/)
             .ok();
     if (!blas_launch_status) {
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index aaaf45d3e7..507fc99837 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -404,10 +404,10 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
                     "Pooling is not yet supported on the batch dimension."));
 
     if (use_dnn_) {
-      DnnPoolingGradOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize,
-          stride, padding_, data_format_, &tensor_in, &tensor_out, out_backprop,
-          output_shape, propagate_nans_);
+      DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
+                                   ksize, stride, padding_, data_format_,
+                                   &tensor_in, &tensor_out, out_backprop,
+                                   output_shape, propagate_nans_);
     } else {
       CHECK(data_format_ == FORMAT_NHWC)
           << "Non-Cudnn MaxPoolGrad only supports NHWC format";
@@ -1136,10 +1136,9 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
 
     // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
     if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
-      DnnPoolingOp<T>::Compute(context,
-                               perftools::gputools::dnn::PoolingMode::kMaximum,
-                               ksize_, stride_, padding_, data_format_,
-                               tensor_in, out_shape, propagate_nans_);
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
+                               stride_, padding_, data_format_, tensor_in,
+                               out_shape, propagate_nans_);
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
@@ -1240,9 +1239,8 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
         ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
                         params.out_width, params.depth);
     if (use_dnn_ && data_format_ == FORMAT_NCHW) {
-      DnnPoolingOp<T>::Compute(context,
-                               perftools::gputools::dnn::PoolingMode::kMaximum,
-                               ksize, stride, padding_, data_format_, tensor_in,
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize,
+                               stride, padding_, data_format_, tensor_in,
                                out_shape, propagate_nans_);
     } else {
       CHECK(data_format_ == FORMAT_NHWC)
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index 01bcfede1e..2180c4eb97 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -748,9 +748,8 @@ struct LaunchPoolingOp<GPUDevice, T, AVG> {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Padding padding_type,
                      Tensor* output) {
-    DnnPooling3dOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kAverage, window,
-        stride, padding, data_format, tensor_in, output);
+    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, window,
+                               stride, padding, data_format, tensor_in, output);
   }
 };
 
@@ -762,9 +761,8 @@ struct LaunchPoolingOp<GPUDevice, T, MAX> {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Padding padding_type,
                      Tensor* output) {
-    DnnPooling3dOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kMaximum, window,
-        stride, padding, data_format, tensor_in, output);
+    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, window,
+                               stride, padding, data_format, tensor_in, output);
   }
 };
 
@@ -778,10 +776,10 @@ struct LaunchMaxPooling3dGradOp<GPUDevice, T> {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Tensor* input_backprop) {
     const TensorShape output_shape = tensor_in.shape();
-    DnnPooling3dGradOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kMaximum, window,
-        stride, padding, out, data_format, out_backprop, output_shape,
-        &tensor_in, &tensor_out, input_backprop);
+    DnnPooling3dGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
+                                   window, stride, padding, out, data_format,
+                                   out_backprop, output_shape, &tensor_in,
+                                   &tensor_out, input_backprop);
   }
 };
 
@@ -796,9 +794,8 @@ struct LaunchAvgPooling3dGradOp<GPUDevice, T> {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Tensor* output) {
     DnnPooling3dGradOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kAverage, window,
-        stride, padding, out, data_format, out_backprop, tensor_in_shape,
-        nullptr, nullptr, output);
+        context, se::dnn::PoolingMode::kAverage, window, stride, padding, out,
+        data_format, out_backprop, tensor_in_shape, nullptr, nullptr, output);
   }
 };
 
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index d4241b5809..e583f7feb4 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -114,11 +114,9 @@ TensorShape PoolParameters::forward_output_shape() {
 
 namespace {
 template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
-                                                    uint64 size) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
-                                                size * sizeof(T));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 }  // namespace
@@ -138,12 +136,13 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC)
 }  // namespace functor
 
 template <typename T>
-void DnnPoolingOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
-    const std::vector<int32>& size, const std::vector<int32>& stride,
-    Padding padding, TensorFormat data_format, const Tensor& tensor_in,
-    const TensorShape& tensor_out_shape, bool propagate_nans) {
+void DnnPoolingOp<T>::Compute(OpKernelContext* context,
+                              se::dnn::PoolingMode pooling_mode,
+                              const std::vector<int32>& size,
+                              const std::vector<int32>& stride, Padding padding,
+                              TensorFormat data_format, const Tensor& tensor_in,
+                              const TensorShape& tensor_out_shape,
+                              bool propagate_nans) {
   Tensor* tensor_out = nullptr;
   OP_REQUIRES_OK(context,
                  context->allocate_output(0, tensor_out_shape, &tensor_out));
@@ -184,7 +183,7 @@ void DnnPoolingOp<T>::Compute(
   }
 
   /// Get ready to call cudnn
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc;
+  se::dnn::PoolingDescriptor pooling_desc;
   pooling_desc.set_pooling_mode(pooling_mode)
       .set_window_height(params.window_rows)
       .set_window_width(params.window_cols)
@@ -194,19 +193,19 @@ void DnnPoolingOp<T>::Compute(
       .set_horizontal_padding(params.pad_cols)
       .set_propagate_nans(propagate_nans);
 
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(params.tensor_in_batch)
       .set_height(params.tensor_in_rows)
       .set_width(params.tensor_in_cols)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(params.tensor_in_batch)
       .set_height(params.out_height)
       .set_width(params.out_width)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
   auto input_data = AsDeviceMemory(transformed_input.template flat<T>().data(),
                                    transformed_input.template flat<T>().size());
@@ -236,13 +235,12 @@ void DnnPoolingOp<T>::Compute(
 
 template <typename T>
 void DnnPoolingGradOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
+    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
     const std::vector<int32>& size, const std::vector<int32>& stride,
     Padding padding, TensorFormat data_format, const Tensor* tensor_in,
     const Tensor* tensor_out, const Tensor& out_backprop,
     const TensorShape& tensor_in_shape, bool propagate_nans) {
-  CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
+  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
         (tensor_in && tensor_out))
       << "For MaxPoolGrad, both tensor_in and tensor_out needs to be "
          "specified";
@@ -327,7 +325,7 @@ void DnnPoolingGradOp<T>::Compute(
   }
 
   /// Get ready to call cudnn
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc;
+  se::dnn::PoolingDescriptor pooling_desc;
   pooling_desc.set_pooling_mode(pooling_mode)
       .set_window_height(params.window_rows)
       .set_window_width(params.window_cols)
@@ -337,19 +335,19 @@ void DnnPoolingGradOp<T>::Compute(
       .set_horizontal_padding(params.pad_cols)
       .set_propagate_nans(propagate_nans);
 
-  perftools::gputools::dnn::BatchDescriptor orig_output_desc;
+  se::dnn::BatchDescriptor orig_output_desc;
   orig_output_desc.set_count(params.tensor_in_batch)
       .set_height(params.out_height)
       .set_width(params.out_width)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-  perftools::gputools::dnn::BatchDescriptor orig_input_desc;
+  se::dnn::BatchDescriptor orig_input_desc;
   orig_input_desc.set_count(params.tensor_in_batch)
       .set_height(params.tensor_in_rows)
       .set_width(params.tensor_in_cols)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
   auto orig_output_data =
       AsDeviceMemory(transformed_output.template flat<T>().data(),
diff --git a/tensorflow/core/kernels/pooling_ops_common_gpu.h b/tensorflow/core/kernels/pooling_ops_common_gpu.h
index 1458456585..7362c5275f 100644
--- a/tensorflow/core/kernels/pooling_ops_common_gpu.h
+++ b/tensorflow/core/kernels/pooling_ops_common_gpu.h
@@ -40,7 +40,7 @@ class DnnPoolingOp {
  public:
   typedef GPUDevice Device;
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::vector<int32>& size,
                       const std::vector<int32>& stride, Padding padding,
                       TensorFormat data_format, const Tensor& tensor_in,
@@ -55,7 +55,7 @@ class DnnPoolingGradOp {
  public:
   typedef GPUDevice Device;
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::vector<int32>& size,
                       const std::vector<int32>& stride, Padding padding,
                       TensorFormat data_format, const Tensor* tensor_in,
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 2fc73a3309..c87ce78e05 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -40,7 +40,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/cuda_solvers.h"
 #include "tensorflow/core/platform/cuda.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using stream_executor::cuda::ScopedActivateExecutorContext;
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -242,7 +242,7 @@ class SegmentSumGPUOp : public AsyncOpKernel {
       return;
     }
 
-    perftools::gputools::DeviceMemoryBase output_rows_device(
+    se::DeviceMemoryBase output_rows_device(
         const_cast<Tensor&>(segment_ids).template flat<Index>().data() +
         (num_indices - 1));
     ScratchSpace<Index> output_rows_host(context, 1, /* on_host */ true);
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
index f92c4ed17a..3330442ffd 100644
--- a/tensorflow/core/kernels/where_op.cc
+++ b/tensorflow/core/kernels/where_op.cc
@@ -42,7 +42,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/cuda_solvers.h"
 #include "tensorflow/core/platform/cuda.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using stream_executor::cuda::ScopedActivateExecutorContext;
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -278,8 +278,7 @@ class WhereGPUOp : public AsyncOpKernel {
 
     auto num_true_t = num_true.scalar<Tindex>();
 
-    perftools::gputools::DeviceMemoryBase num_true_ptr(
-        static_cast<void*>(num_true_t.data()));
+    se::DeviceMemoryBase num_true_ptr(static_cast<void*>(num_true_t.data()));
     // Push kernel to stream to get number of true elements.
     const GPUDevice& d = context->eigen_device<GPUDevice>();
     Status s = functor::NumTrue<GPUDevice, T, Tindex>::Compute(
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
index 580db4844f..7ac5e5c445 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
@@ -28,27 +28,27 @@ namespace profiler {
 
 namespace dynload {
 
-#define LIBCUPTI_WRAP(__name)                                               \
-  struct DynLoadShim__##__name {                                            \
-    static const char* kName;                                               \
-    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;        \
-    static void* GetDsoHandle() {                                           \
-      static auto status = perftools::gputools::internal::CachedDsoLoader:: \
-          GetLibcuptiDsoHandle();                                           \
-      return status.ValueOrDie();                                           \
-    }                                                                       \
-    static FuncPointerT DynLoad() {                                         \
-      static void* f;                                                       \
-      TF_CHECK_OK(::tensorflow::Env::Default()->GetSymbolFromLibrary(       \
-          GetDsoHandle(), kName, &f))                                       \
-          << "could not find " << kName << "in libcupti DSO";               \
-      return reinterpret_cast<FuncPointerT>(f);                             \
-    }                                                                       \
-    template <typename... Args>                                             \
-    CUptiResult operator()(Args... args) {                                  \
-      return DynLoad()(args...);                                            \
-    }                                                                       \
-  } __name;                                                                 \
+#define LIBCUPTI_WRAP(__name)                                                 \
+  struct DynLoadShim__##__name {                                              \
+    static const char* kName;                                                 \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                             \
+      static auto status =                                                    \
+          stream_executor::internal::CachedDsoLoader::GetLibcuptiDsoHandle(); \
+      return status.ValueOrDie();                                             \
+    }                                                                         \
+    static FuncPointerT DynLoad() {                                           \
+      static void* f;                                                         \
+      TF_CHECK_OK(::tensorflow::Env::Default()->GetSymbolFromLibrary(         \
+          GetDsoHandle(), kName, &f))                                         \
+          << "could not find " << kName << "in libcupti DSO";                 \
+      return reinterpret_cast<FuncPointerT>(f);                               \
+    }                                                                         \
+    template <typename... Args>                                               \
+    CUptiResult operator()(Args... args) {                                    \
+      return DynLoad()(args...);                                              \
+    }                                                                         \
+  } __name;                                                                   \
   const char* DynLoadShim__##__name::kName = #__name;
 
 LIBCUPTI_WRAP(cuptiActivityDisable);
diff --git a/tensorflow/core/platform/types.h b/tensorflow/core/platform/types.h
index f2471712cc..68897ac423 100644
--- a/tensorflow/core/platform/types.h
+++ b/tensorflow/core/platform/types.h
@@ -63,9 +63,7 @@ typedef uint64 Fprint;
 // Alias namespace ::stream_executor as ::tensorflow::se.
 namespace stream_executor {}
 namespace tensorflow {
-// TODO(b/77980417): Uncomment this once all namespace aliases named 'se' are
-// removed in ::xla.
-// namespace se = ::stream_executor;
+namespace se = ::stream_executor;
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_PLATFORM_TYPES_H_
-- 
GitLab


From 7bee86727b87a8317d4f1407061edfa9ccb16ea5 Mon Sep 17 00:00:00 2001
From: Igor Ganichev <iga@google.com>
Date: Mon, 23 Apr 2018 19:35:12 -0700
Subject: [PATCH 339/450] Don't Ref() XlaDeviceContext unnecessarily.

PiperOrigin-RevId: 194024407
---
 tensorflow/compiler/jit/xla_device.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 7beb18c04d..3e27cd39c6 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -234,7 +234,6 @@ Status XlaDevice::CreateAndSetGpuDeviceInfo() {
     gpu_device_info_->stream = stream;
     gpu_device_info_->default_context =
         new XlaDeviceContext(stream, client(), transfer_as_literal_);
-    gpu_device_info_->default_context->Ref();
     set_tensorflow_gpu_device_info(gpu_device_info_.get());
   }
 
-- 
GitLab


From 3f7c9265b59cae306d029dfac76e25badd20def8 Mon Sep 17 00:00:00 2001
From: Sung Jin Hwang <sjhwang@google.com>
Date: Mon, 23 Apr 2018 19:35:19 -0700
Subject: [PATCH 340/450] Add missing pmf_to_cdf_op.cc in the source list in
 cmake.

Also split range_coder_ops.cc and range_coder_ops_util.cc into separate targets
so that depending on range_coder_ops_util.cc does not register kernels again.

PiperOrigin-RevId: 194024410
---
 tensorflow/contrib/coder/BUILD | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD
index a146460a9c..a2c6e41303 100644
--- a/tensorflow/contrib/coder/BUILD
+++ b/tensorflow/contrib/coder/BUILD
@@ -54,19 +54,27 @@ tf_gen_op_libs(
     ],
 )
 
+cc_library(
+    name = "range_coder_ops_util",
+    srcs = ["kernels/range_coder_ops_util.cc"],
+    hdrs = ["kernels/range_coder_ops_util.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 tf_kernel_library(
     name = "range_coder_ops",
     srcs = [
         "kernels/range_coder_ops.cc",
-        "kernels/range_coder_ops_util.cc",
-    ],
-    hdrs = [
-        "kernels/range_coder_ops_util.h",
     ],
     visibility = ["//visibility:public"],
     deps = [
         ":coder_ops_op_lib",
         ":range_coder",
+        ":range_coder_ops_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
-- 
GitLab


From 24b7c9a800ab5086d45a7d83ebcd6218424dc9e3 Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Mon, 23 Apr 2018 20:15:30 -0700
Subject: [PATCH 341/450] Make all_reduce._split_by_task function able to deal
 with different jobs.

PiperOrigin-RevId: 194027134
---
 .../contrib/all_reduce/python/all_reduce.py   | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index 8add2aacff..159d985db5 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -18,10 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import math
-import re
 
 from tensorflow.contrib import nccl
+from tensorflow.python.framework import device as device_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -659,21 +660,20 @@ def _split_by_task(devices, values):
   num_devices = len(devices)
   if num_devices != len(values):
     raise ValueError("len(devices) must equal len(values)")
-  pattern = re.compile(r"/task:(\d+)/")
-  per_task_devices = []
-  per_task_values = []
+  per_task_devices = collections.OrderedDict()
+  per_task_values = collections.OrderedDict()
   for d in range(num_devices):
-    m = pattern.search(devices[d])
-    if m:
-      index = int(m.group(1))
-      while index >= len(per_task_devices):
-        per_task_devices.append([])
-        per_task_values.append([])
-      per_task_devices[index].append(devices[d])
-      per_task_values[index].append(values[d])
-    else:
+    d_spec = device_lib.DeviceSpec.from_string(devices[d])
+    if not hasattr(d_spec, "task") or d_spec.task is None:
       assert False, "failed to parse device %s" % devices[d]
-  return (per_task_devices, per_task_values)
+    index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task)
+    if index not in per_task_devices:
+      per_task_devices[index] = []
+      per_task_values[index] = []
+    per_task_devices[index].append(devices[d])
+    per_task_values[index].append(values[d])
+
+  return (list(per_task_devices.values()), list(per_task_values.values()))
 
 
 def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
-- 
GitLab


From 22f3a97b8b089202f60bb0c7697feb0c8e0713cc Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Mon, 23 Apr 2018 21:19:14 -0700
Subject: [PATCH 342/450] Merge changes from github.

PiperOrigin-RevId: 194031845
---
 CODEOWNERS                                    |   2 +-
 README.md                                     |   2 +-
 RELEASE.md                                    |  58 +++
 WORKSPACE                                     |   8 +-
 tensorflow/c/c_api.h                          |   4 +-
 tensorflow/c/c_api_experimental.cc            |  12 +
 tensorflow/c/c_api_experimental.h             |   4 +-
 tensorflow/c/eager/c_api.h                    |   4 +-
 tensorflow/compiler/aot/runtime.cc            |   4 +-
 tensorflow/compiler/tests/binary_ops_test.py  |  12 +-
 .../compiler/xla/python/xla_client_test.py    |   1 -
 .../gpu/cudnn_convolution_algorithm_picker.cc |   4 +-
 .../compiler/xla/tests/dot_operation_test.cc  |   7 +
 .../autograph/converters/call_trees.py        |   2 +-
 .../autograph/converters/call_trees_test.py   |   2 +-
 .../autograph/converters/decorators_test.py   |   2 +-
 tensorflow/contrib/autograph/impl/api.py      |   4 +-
 .../contrib/autograph/impl/conversion.py      |   2 +-
 .../pyct/static_analysis/activity.py          |   6 +-
 .../pyct/static_analysis/activity_test.py     |   2 +-
 .../autograph/pyct/static_analysis/annos.py   |   8 +-
 .../contrib/autograph/utils/builtins.py       |   2 +-
 .../bayesflow/python/ops/monte_carlo_impl.py  |  26 +-
 .../training/functions/gbdt_batch_test.py     |   2 +-
 .../python/split_dependency_test.py           |   2 +-
 tensorflow/contrib/cmake/CMakeLists.txt       |  70 +++-
 tensorflow/contrib/cmake/README.md            |  28 ++
 .../contrib/cmake/external/gemmlowp.cmake     |   4 +-
 .../contrib/cmake/external/mkldnn.cmake       |  44 +++
 tensorflow/contrib/cmake/external/png.cmake   |  19 +-
 .../contrib/cmake/external/sqlite.cmake       |   4 +-
 .../contrib/cmake/tf_core_framework.cmake     |   8 +-
 tensorflow/contrib/cmake/tf_python.cmake      |   9 +-
 tensorflow/contrib/cmake/tf_shared_lib.cmake  |   3 +-
 .../contrib/cmake/tf_stream_executor.cmake    |   6 +
 .../contrib/cmake/tools/create_def_file.py    |   8 +-
 .../crf/python/kernel_tests/crf_test.py       |  15 +
 tensorflow/contrib/crf/python/ops/crf.py      |   8 +-
 .../cudnn_rnn/python/layers/cudnn_rnn.py      |   3 +-
 .../contrib/data/python/kernel_tests/BUILD    |   7 +-
 .../dataset_serialization_test_base.py        |   2 +-
 .../interleave_dataset_op_test.py             |  63 ++--
 .../kernel_tests/stats_dataset_ops_test.py    |  16 +
 .../contrib/data/python/ops/interleave_ops.py |  26 +-
 .../data/python/ops/prefetching_ops.py        |   6 +-
 .../contrib/data/python/ops/scan_ops.py       |   2 +-
 .../python/kernel_tests/shape_test.py         |   1 -
 tensorflow/contrib/eager/python/saver_test.py |   1 -
 .../estimator/python/estimator/head.py        |   2 +-
 .../python/estimator/replicate_model_fn.py    |   4 +-
 .../factorization/python/ops/gmm_ops.py       |  12 +-
 .../factorization/python/ops/kmeans.py        |   4 +-
 tensorflow/contrib/framework/__init__.py      |   3 +-
 .../python/framework/tensor_util_test.py      |   2 +-
 .../ops/fused_conv2d_bias_activation_op.py    |   2 +-
 .../fused_conv2d_bias_activation_op_test.py   |  10 +-
 .../eval/python/sliced_wasserstein_impl.py    |   2 +-
 .../features/python/virtual_batchnorm_impl.py |   6 +-
 tensorflow/contrib/hvx/README.md              |   3 +-
 .../kernels/adjust_hsv_in_yiq_op_gpu.cu.cc    |   2 +-
 .../contrib/image/ops/distort_image_ops.cc    |   4 +-
 tensorflow/contrib/image/ops/image_ops.cc     |   2 +-
 ...single_image_random_dot_stereograms_ops.cc |   4 +-
 .../contrib/image/python/ops/image_ops.py     |   2 +-
 .../single_image_random_dot_stereograms.py    |   2 +-
 .../contrib/kfac/python/ops/loss_functions.py |   6 +-
 .../kfac/python/ops/loss_functions_lib.py     |   1 -
 .../labeled_tensor/python/ops/ops_test.py     |   4 +-
 .../sparse_feature_cross_op_test.py           |   2 +-
 .../layers/python/layers/feature_column.py    |   2 +-
 .../python/layers/feature_column_ops.py       |   4 +-
 .../contrib/layers/python/layers/layers.py    | 142 ++++++-
 .../layers/python/layers/layers_test.py       |  15 +-
 .../python/layers/rev_block_lib_test.py       |   4 +-
 .../layers/python/layers/utils_test.py        |   1 -
 .../python/learn/estimators/kmeans_test.py    |   1 -
 .../python/learn/estimators/run_config.py     |   1 +
 tensorflow/contrib/lite/Makefile              |   3 +-
 .../contrib/lite/download_dependencies.sh     |   6 +-
 .../project.pbxproj                           |   8 -
 tensorflow/contrib/lite/g3doc/apis.md         |   2 +-
 .../Camera2BasicFragment.java                 |  23 ++
 .../tflitecamerademo/ImageClassifier.java     |  10 +
 .../res/layout/fragment_camera2_basic.xml     |  41 ++-
 .../demo/app/src/main/res/values/strings.xml  |   2 +
 .../java/org/tensorflow/lite/Interpreter.java |   7 +
 .../lite/NativeInterpreterWrapper.java        |   6 +
 .../native/nativeinterpreterwrapper_jni.cc    |  10 +
 .../native/nativeinterpreterwrapper_jni.h     |  12 +-
 tensorflow/contrib/lite/kernels/add.cc        |   2 +-
 tensorflow/contrib/lite/kernels/div.cc        |   5 +-
 .../internal/optimized/optimized_ops.h        |   2 +-
 .../internal/reference/reference_ops.h        |  39 +-
 tensorflow/contrib/lite/kernels/sub.cc        |   3 +-
 .../resolve_tensorflow_merge.cc               |   2 +-
 tensorflow/contrib/lite/toco/model.h          |   6 +-
 .../contrib/losses/python/losses/loss_ops.py  |   9 +-
 .../python/metric_learning/metric_loss_ops.py |   4 +-
 .../contrib/makefile/download_dependencies.sh |   4 +-
 .../meta_graph_transform.py                   |   2 +-
 .../contrib/metrics/python/ops/metric_ops.py  |  15 +-
 .../contrib/nn/python/ops/sampling_ops.py     |   2 +-
 tensorflow/contrib/opt/BUILD                  |  17 +
 tensorflow/contrib/opt/__init__.py            |   2 +
 .../contrib/opt/python/training/adamax.py     | 191 ++++++++++
 .../opt/python/training/adamax_test.py        | 348 ++++++++++++++++++
 .../training/moving_average_optimizer_test.py |   4 +-
 .../optimizer_v2/checkpointable_utils_test.py |   2 +-
 .../contrib/optimizer_v2/optimizer_v2.py      |   2 +-
 .../quantize/python/fold_batch_norms.py       |   2 +-
 .../kernel_tests/attention_wrapper_test.py    | 112 +++++-
 .../seq2seq/python/ops/attention_wrapper.py   |  38 +-
 .../python/kernel_tests/mel_ops_test.py       |  13 +
 .../contrib/signal/python/ops/mel_ops.py      |  16 +-
 tensorflow/contrib/slim/README.md             |   8 +-
 .../contrib/slim/python/slim/learning.py      |   5 +-
 .../slim/python/slim/nets/resnet_v1.py        |   2 +-
 .../slim/python/slim/nets/resnet_v2.py        |   2 +-
 .../tensor_forest/client/random_forest.py     |   2 +-
 .../core/ops/hard_routing_function_op.cc      |   2 +-
 .../stochastic_hard_routing_function_op.cc    |   2 +-
 .../stochastic_hard_routing_gradient_op.cc    |   2 +-
 .../tensor_forest/kernels/tree_utils.cc       |   4 +-
 .../tensor_forest/kernels/tree_utils.h        |   2 +-
 .../kernels/v4/decision-tree-resource.h       |   2 +-
 .../kernels/v4/decision_node_evaluator.h      |   2 +-
 .../contrib/tensor_forest/ops/model_ops.cc    |   2 +-
 .../contrib/tensor_forest/ops/stats_ops.cc    |   6 +-
 .../tensor_forest/python/tensor_forest.py     |   2 +-
 tensorflow/contrib/tensorrt/BUILD             |  21 +-
 tensorflow/contrib/tensorrt/README.md         |  60 +--
 .../resources/trt_resource_manager.cc         |   6 +
 .../tensorrt/resources/trt_resource_manager.h |   6 +-
 .../tensorrt/test/tf_trt_integration_test.py  | 156 ++++++++
 .../python/timeseries/math_utils.py           |   2 +-
 .../training/python/training/resample.py      |   2 +-
 .../training/python/training/sampling_ops.py  |   6 +-
 .../training/sequence_queueing_state_saver.py |   4 +-
 tensorflow/core/BUILD                         |  16 +
 .../base_api/api_def_ApplyAdaMax.pbtxt        |  78 ++++
 .../base_api/api_def_BroadcastTo.pbtxt        |  41 +++
 .../base_api/api_def_ImageSummary.pbtxt       |   2 +-
 .../api_def_ResourceApplyAdaMax.pbtxt         |  72 ++++
 .../base_api/api_def_StringStrip.pbtxt        |  16 +
 .../python_api/api_def_ApplyAdaMax.pbtxt      |   4 +
 .../python_api/api_def_BroadcastTo.pbtxt      |   4 +
 .../api_def_ResourceApplyAdaMax.pbtxt         |   4 +
 .../core/common_runtime/bfc_allocator.h       |   2 +-
 .../core/common_runtime/mkl_cpu_allocator.h   |   4 +
 tensorflow/core/framework/collective.h        |   2 +-
 tensorflow/core/framework/numeric_types.h     |   4 +-
 tensorflow/core/graph/mkl_tfconversion_pass.h |   4 +
 .../grappler/clusters/single_machine_test.cc  |   9 +
 tensorflow/core/grappler/optimizers/BUILD     |   1 +
 .../optimizers/custom_graph_optimizer.h       |   4 +-
 .../custom_graph_optimizer_registry_test.cc   |   5 +-
 .../optimizers/meta_optimizer_test.cc         |   5 +-
 tensorflow/core/kernels/BUILD                 |  50 +--
 .../batching_util/shared_batch_scheduler.h    |   6 +-
 tensorflow/core/kernels/broadcast_to_op.cc    |  91 +++++
 tensorflow/core/kernels/broadcast_to_op.h     | 220 +++++++++++
 .../core/kernels/broadcast_to_op_gpu.cu.cc    |  34 ++
 tensorflow/core/kernels/conv_ops_gpu.h        |   5 +-
 tensorflow/core/kernels/ctc_decoder_ops.cc    |  34 +-
 .../core/kernels/mkl_input_conversion_op.cc   |  35 +-
 tensorflow/core/kernels/mkl_relu_op.cc        |   8 +-
 tensorflow/core/kernels/roll_op.cc            |   7 +-
 .../core/kernels/segment_reduction_ops.h      |   8 +
 tensorflow/core/kernels/string_strip_op.cc    |  53 +++
 tensorflow/core/kernels/training_ops.cc       | 150 ++++++++
 tensorflow/core/kernels/training_ops.h        |  12 +
 .../core/kernels/training_ops_gpu.cu.cc       |  30 ++
 tensorflow/core/lib/bfloat16/bfloat16.h       |   4 +-
 tensorflow/core/lib/gtl/manual_constructor.h  |   2 +-
 tensorflow/core/lib/strings/stringprintf.cc   |  10 +-
 .../core/lib/strings/stringprintf_test.cc     |   4 +-
 tensorflow/core/ops/array_ops.cc              |  52 +++
 tensorflow/core/ops/dataset_ops.cc            | 140 ++++++-
 tensorflow/core/ops/manip_ops.cc              |  13 +-
 tensorflow/core/ops/nn_ops.cc                 |   6 +
 tensorflow/core/ops/random_ops.cc             |   7 +-
 tensorflow/core/ops/string_ops.cc             |   5 +
 tensorflow/core/ops/training_ops.cc           |  51 +++
 tensorflow/core/platform/default/logging.cc   |   1 +
 .../platform/hadoop/hadoop_file_system.cc     |   2 +
 .../core/protobuf/rewriter_config.proto       |  11 +
 tensorflow/core/public/version.h              |   4 +-
 tensorflow/core/util/memmapped_file_system.cc |   2 +-
 tensorflow/core/util/memmapped_file_system.h  |   4 +-
 tensorflow/core/util/mkl_util.h               |   4 +
 .../python/contrib.bayesflow.monte_carlo.md   |  28 +-
 .../docs_src/community/documentation.md       |  52 +--
 tensorflow/docs_src/deploy/s3.md              |  81 +++-
 .../docs_src/extend/language_bindings.md      |   9 +-
 tensorflow/docs_src/install/install_c.md      |   2 +-
 tensorflow/docs_src/install/install_go.md     |   2 +-
 tensorflow/docs_src/install/install_java.md   |  24 +-
 tensorflow/docs_src/install/install_linux.md  |  58 ++-
 tensorflow/docs_src/install/install_mac.md    |  10 +-
 .../docs_src/install/install_sources.md       |   9 +-
 tensorflow/docs_src/mobile/android_build.md   |   3 +-
 .../docs_src/performance/quantization.md      |   2 +-
 .../docs_src/programmers_guide/debugger.md    |   2 +-
 .../docs_src/programmers_guide/graphs.md      |   6 +-
 .../docs_src/programmers_guide/saved_model.md |  50 +--
 .../docs_src/programmers_guide/using_tpu.md   |   4 +-
 .../docs_src/tutorials/audio_recognition.md   |   2 +-
 tensorflow/docs_src/tutorials/layers.md       |  17 +-
 .../tutorials/word2vec/word2vec_basic.py      |   2 +-
 tensorflow/go/op/wrappers.go                  |   2 +-
 .../org/tensorflow/examples/LabelImage.java   |   2 +
 tensorflow/python/BUILD                       |  19 +-
 tensorflow/python/debug/cli/readline_ui.py    |   8 +-
 .../python/debug/wrappers/grpc_wrapper.py     |  11 +-
 tensorflow/python/debug/wrappers/hooks.py     |  17 +-
 tensorflow/python/estimator/canned/head.py    |   9 +-
 tensorflow/python/estimator/estimator.py      |   5 +-
 tensorflow/python/estimator/run_config.py     |  33 +-
 .../python/estimator/run_config_test.py       |  24 +-
 .../python/feature_column/feature_column.py   |   1 -
 tensorflow/python/framework/dtypes.py         |  14 +-
 .../python/framework/graph_util_impl.py       |   2 +-
 .../python/framework/graph_util_test.py       |   2 +-
 tensorflow/python/framework/load_library.py   |   2 +-
 tensorflow/python/framework/python_op_gen.i   |   8 +-
 tensorflow/python/framework/test_util.py      |   2 +
 .../python/grappler/layout_optimizer_test.py  |  10 +-
 .../python/keras/_impl/keras/backend.py       |   4 +-
 .../keras/_impl/keras/layers/normalization.py |   4 +-
 tensorflow/python/kernel_tests/BUILD          |  26 ++
 .../kernel_tests/broadcast_to_ops_test.py     |  85 +++++
 .../kernel_tests/confusion_matrix_test.py     |   7 +-
 .../python/kernel_tests/constant_op_test.py   |   5 +
 .../kernel_tests/conv3d_transpose_test.py     |  12 +
 .../python/kernel_tests/manip_ops_test.py     |  55 ++-
 .../python/kernel_tests/norm_op_test.py       |  16 +-
 .../python/kernel_tests/py_func_test.py       |  32 ++
 .../random/multinomial_op_test.py             |   2 +-
 .../kernel_tests/random/random_ops_test.py    |  11 +
 .../kernel_tests/string_strip_op_test.py      |  56 +++
 tensorflow/python/lib/core/py_func.cc         |   3 +
 tensorflow/python/ops/array_ops.py            |  15 +-
 .../python/ops/distributions/categorical.py   |   2 +-
 tensorflow/python/ops/embedding_ops.py        |  26 +-
 tensorflow/python/ops/histogram_ops.py        |   1 -
 tensorflow/python/ops/image_ops_impl.py       |  74 ++--
 tensorflow/python/ops/init_ops.py             |  18 +-
 tensorflow/python/ops/linalg_ops.py           |  77 ++--
 tensorflow/python/ops/linalg_ops_impl.py      |  73 ++++
 tensorflow/python/ops/losses/losses_impl.py   |  23 +-
 tensorflow/python/ops/math_ops.py             |  38 +-
 tensorflow/python/ops/nn.py                   |   1 +
 tensorflow/python/ops/nn_impl.py              |  11 +-
 tensorflow/python/ops/nn_ops.py               |   8 +-
 tensorflow/python/ops/rnn_cell_impl.py        |   4 +-
 .../python/profiler/tfprof_logger_test.py     |   2 +-
 tensorflow/python/tools/saved_model_cli.py    |   3 +-
 tensorflow/python/training/saver_test.py      |   2 +-
 tensorflow/python/util/compat.py              |   7 +-
 tensorflow/stream_executor/cuda/cuda_dnn.cc   |   7 +-
 tensorflow/stream_executor/cuda/cuda_dnn.h    |   2 +-
 .../stream_executor/cuda/cuda_driver.cc       |  14 +-
 .../stream_executor/cuda/cuda_gpu_executor.cc |   2 +-
 tensorflow/stream_executor/dnn.h              |  20 +-
 tensorflow/stream_executor/platform/port.h    |   6 -
 tensorflow/tensorflow.bzl                     |   3 +-
 .../tensorflow.estimator.-run-config.pbtxt    |   6 +-
 tensorflow/tools/api/golden/tensorflow.pbtxt  |   4 +
 tensorflow/tools/ci_build/builds/pip.sh       |   4 +
 .../tools/ci_build/builds/test_user_ops.sh    |  41 ++-
 .../tools/ci_build/linux/cpu/run_mkl.sh       |   5 +-
 .../ci_build/windows/gpu/cmake/run_py.bat     |   6 +-
 tensorflow/tools/docker/Dockerfile.devel      |   2 +-
 .../tools/docker/Dockerfile.devel-cpu-mkl     |   2 +-
 tensorflow/tools/docker/Dockerfile.devel-gpu  |   2 +-
 tensorflow/tools/git/gen_git_source.py        |  56 ++-
 tensorflow/tools/git/gen_git_source.sh        |  10 +-
 .../tools/graph_transforms/transform_graph.cc |  70 +++-
 tensorflow/tools/pip_package/setup.py         |   2 +-
 tensorflow/workspace.bzl                      |   9 +-
 third_party/repo.bzl                          |   3 +-
 281 files changed, 4024 insertions(+), 895 deletions(-)
 create mode 100644 tensorflow/contrib/cmake/external/mkldnn.cmake
 create mode 100644 tensorflow/contrib/opt/python/training/adamax.py
 create mode 100644 tensorflow/contrib/opt/python/training/adamax_test.py
 create mode 100644 tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt
 create mode 100644 tensorflow/core/kernels/broadcast_to_op.cc
 create mode 100644 tensorflow/core/kernels/broadcast_to_op.h
 create mode 100644 tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
 create mode 100644 tensorflow/core/kernels/string_strip_op.cc
 create mode 100644 tensorflow/python/kernel_tests/broadcast_to_ops_test.py
 create mode 100644 tensorflow/python/kernel_tests/string_strip_op_test.py
 create mode 100644 tensorflow/python/ops/linalg_ops_impl.py

diff --git a/CODEOWNERS b/CODEOWNERS
index 007a304c3e..b9f0313cc6 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -45,7 +45,7 @@
 # /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh
 # /tensorflow/contrib/slim/ @sguada @thenbasilmanran
 # /tensorflow/contrib/stateless/ @girving
-# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst
+# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank
 # /tensorflow/contrib/testing/ @dandelionmane
 # /tensorflow/contrib/timeseries/ @allenlavoie
 # /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu
diff --git a/README.md b/README.md
index 29418dc2e9..e1a50c87e2 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ data flow graphs.  The graph nodes represent mathematical operations, while
 the graph edges represent the multidimensional data arrays (tensors) that flow
 between them.  This flexible architecture enables you to deploy computation to one
 or more CPUs or GPUs in a desktop, server, or mobile device without rewriting
-code.  TensorFlow also includes TensorBoard, a data visualization toolkit.
+code.  TensorFlow also includes [TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard), a data visualization toolkit.
 
 TensorFlow was originally developed by researchers and engineers
 working on the Google Brain team within Google's Machine Intelligence Research
diff --git a/RELEASE.md b/RELEASE.md
index e845953174..2717c75740 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,61 @@
+# Release 1.8.0
+
+## Major Features And Improvements
+* Can now pass `tf.contrib.distribute.MirroredStrategy()` to `tf.estimator.RunConfig()` to run an Estimator model on multiple GPUs on one machine.
+* Add `tf.contrib.data.prefetch_to_device()`, which supports prefetching to GPU memory.
+* Added Gradient Boosted Trees as pre-made Estimators: BoostedTreesClassifier, BoostedTreesRegressor.
+* Add 3rd generation pipeline config for Cloud TPUs which improves performance and usability.
+* `tf.contrib.bayesflow` is moving out to its own repo.
+* Added `tf.contrib.{proto,rpc}` to allow generic proto parsing and RPC communication.
+
+## Bug Fixes and Other Changes
+* `tf.data`:
+  * Add `tf.contrib.data.prefetch_to_device`, which enables prefetching dataset elements to GPU memory.
+  * Add `tf.contrib.data.AUTOTUNE`, which allows the tf.data runtime to automatically tune the prefetch buffer sizes based on your system and environment.
+  * Add `tf.contrib.data.make_csv_dataset` for building datasets of CSV files.
+* Eager Execution:
+  * With eager execution Datasets can now be used as standard python iterators (`for batch in dataset:`). Both `Dataset.__iter__()` and `Dataset.make_one_shot_iterator()` can now be used to create iterators when eager execution is enabled.
+  * Automatic device placement has been enabled (i.e., use a GPU if available automatically, without requiring an explicit `with tf.device("/gpu:0")`) (Fixes #14133)
+  * `tf.GradientTape` has moved out of contrib.
+* `tf.keras`:
+  * Added the fashion mnist dataset.
+  * New data preprocessing functions: `image/random_brightness`, `sequence/TimeseriesGenerator`, and `text/hashing_trick`.
+* Accelerated Linear Algebra (XLA):
+  * Select and scatter in reference util and evaluator now use lexicographical order to break ties.
+* TensorFlow Debugger (tfdbg) CLI:
+  * During tensor-filter operations, allow exclusion of nodes by regular expressions.
+  * Fix spurious background colors in some text terminals.
+* `tf.contrib`:
+  * Add meta-distribution BatchReshape which reshapes batch dimensions.
+  * `tf.contrib.layers.recompute_grad` works for explicit gradient checkpointing on TPU.
+  * Add `tf.contrib.framework.argsort`.
+  * Allow `DNNBoostedTreeCombinedEstimator` to work with core versions of feature columns and losses.
+  * Add non-linear image warping ops: `tf.contrib.image.sparse_image_warp`, `tf.contrib.image.dense_image_warp`, and `tf.contrib.image.interpolate_spline`.
+  * Fix bug in `tf.contrib.opt.MultitaskOptimizerWrapper` where types of tensors were mismatched.
+* Other:
+  * Low-level graph construction now calls the TensorFlow C API. This change should be invisible to most users, but can be disabled by setting the environment variable `TF_C_API_GRAPH_CONSTRUCTION=0` in this release. Future releases will remove the ability to disable this change. Please [file a bug](https://github.com/tensorflow/tensorflow/issues/new) if you find yourself using this escape hatch.
+  * Add description of shapes and a pointer to tutorial notebook in `tf.distributions.Distribution`.
+  * Update scatter operations:
+    * Add `tf.scatter_min` and `tf.scatter_max`
+    * Extend scatter operations to work with a scalar update parameter.
+  * Move cuDNN RNN ops to core for use in TensorFlow codebase only.
+  * Add `float64` support for `Conv2d`, `Conv2dBackpropInput`, and `Conv2dBackpropFilter`.
+  * Add `float64` support for `AvgPool`/`AvgPoolGrad`.
+  * Make graph name scope thread local so that they work correctly in multi-threaded environments.
+  * Update nsync synchronization library to avoid slow primitives on Linux.
+  * Removed need to put nsync/public on C include path when building custom ops.
+  * Add `tf.image.psnr`, `tf.image.ssim`, `tf.image.ssim_multiscale`, `tf.image.image_gradients`, `tf.image.sobel_edges`.
+  * Add links to https://js.tensorflow.org.
+  * Fix non-uniformity of orthogonal matrices.
+  * Fix bug where multi-image Estimator eval summaries were not displayed correctly.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4d55397500, Aghasy, Alan Du, Alan Lee, Alan Yee, Alex Wiltschko, Animesh Karnewar, Ankit Gupta, Anton Matosov, Aris L, Ben Barsdell, Brent Yi, Brett Koonce, Carl Thomé, cbockman, Chikanaga Tomoyuki, Chris Tava, Cédric Deltheil, Dahan Gong, Dalmo Cirne, Daniel Erenrich, David Norman, DavidNorman, Edd Wilder-James, Fanjin Zeng, Felix Abecassis, fo40225, George Sterpu, Giovanni Terlingen, Gor Baghdasaryan, Guillaume Klein, Hanchen Li, Ilya Polenov, Jakub Kolodziejczyk, Jason Sadler, Jayaram Bobba, Jerry Liu, jinghuangintel, Jiongyan Zhang (张炯衍), Joel Shor, Jong Wook Kim, Julian Eisenschlos, Karl Lessard, Krish Ravindranath, Loo Rong Jie, Lukas Geiger, Luke Iwanski, Mahmoud Abuzaina, ManHyuk, Marvin Richter, Maximilian Mitchell, Mohammad Ashraf Bhuiyan, msofka, Mustafa Kasap, Nathan Burnham, Nathan Luehr, Naveen Marri, ngc92, nio1814, Oleg Zabluda, Ou Changkun, Panos Ipeirotis, Paul Van Eck, Peter Lee, Piotr Czapla, qjivy, Rholais Lii, Rodrigo Formigone, Russell Klopfer, ryantimjohn, Sang Han, Sebastián Ramírez, shengfuintel, Siby Jose Plathottam, Silver Chan, Stanislaw Antol, Taehoon Lee, Tarang Chugh, Ted Chang, Thomas Bastiani, Xian Xu, Xiaoming (Jason) Cui, Yan Facai (颜发才), yaox12, Yashal Shakti Kanungo, Yong Tang, Yuan (Terry) Tang, Yuxin Wu, Ziyue(Louis) Lu
+
+
 # Release 1.7.0
 
 ## Major Features And Improvements
diff --git a/WORKSPACE b/WORKSPACE
index 11c5cdb207..4ddfb9a383 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -2,11 +2,11 @@ workspace(name = "org_tensorflow")
 
 http_archive(
     name = "io_bazel_rules_closure",
-    sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657",
-    strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f",
+    sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
+    strip_prefix = "rules_closure-dbb96841cc0a5fb2664c37822803b06dab20c7d1",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",
-        "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",  # 2018-01-16
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",
+        "https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",  # 2018-04-13
     ],
 )
 
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index fe85f8ee0e..c859434745 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -72,7 +72,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -80,7 +80,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 9678ee926f..d3916bc167 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -184,6 +184,7 @@ library {
   return std::move(functions[0]);
 }
 
+#if not defined(PLATFORM_WINDOWS)
 //  On success, returns a set of TF_Function instances encoding a dataset
 //  node stack that reads a Imagenet TFRecordFile dataset from `file_path`, and
 //  sets `dataset_name` to the created dataset name. The returned functions must
@@ -7076,7 +7077,9 @@ library {
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
 #endif
 }
+#endif
 
+#if not defined(PLATFORM_WINDOWS)
 //  On success, returns a set of TF_Function instances encoding a dataset
 //  node stack that reads an MNIST file dataset from `file_path`, and
 //  sets `dataset_name` to the created dataset name. The returned functions must
@@ -8221,6 +8224,7 @@ library {
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
 #endif
 }
+#endif
 
 // Adds the input functions to `graph`.  On success, returns the created
 // IteratorGetNext node.
@@ -8314,6 +8318,13 @@ TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(TF_Graph* graph,
 TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
     TF_Graph* graph, const char* file_path, int batch_size,
     unsigned char is_mnist, TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  // TODO(ashankar): get these functions working on Windows.
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return nullptr;
+#else
   tensorflow::Status s;
 
   std::string dataset_name;
@@ -8355,4 +8366,5 @@ TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
           << graph->graph.ToGraphDefDebug().DebugString();
 
   return getnext_node;
+#endif
 }
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 666342974e..88cb173cd2 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -35,7 +35,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -43,7 +43,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 15ac0f376c..ba77f3cd07 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -30,7 +30,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -38,7 +38,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/compiler/aot/runtime.cc b/tensorflow/compiler/aot/runtime.cc
index 5772776666..5e74079fc1 100644
--- a/tensorflow/compiler/aot/runtime.cc
+++ b/tensorflow/compiler/aot/runtime.cc
@@ -31,7 +31,7 @@ namespace {
 inline void* aligned_malloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN)
   return memalign(minimum_alignment, size);
-#elif defined(COMPILER_MSVC)
+#elif defined(_WIN32)
   return _aligned_malloc(size, minimum_alignment);
 #else  // !__ANDROID__ && !OS_ANDROID && !OS_CYGWIN
   void* ptr = nullptr;
@@ -48,7 +48,7 @@ inline void* aligned_malloc(size_t size, int minimum_alignment) {
 }
 
 inline void aligned_free(void* aligned_memory) {
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
   _aligned_free(aligned_memory);
 #else
   free(aligned_memory);
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index d1d7379c0a..1e4dd32916 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -360,11 +360,13 @@ class BinaryOpsTest(XLATestCase):
           np.array([2, -1], dtype=dtype),
           expected=np.array([[[[3, 1], [5, 3]]]], dtype=dtype))
 
-    self._testBinary(
-        math_ops.add,
-        np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
-        np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
-        expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36], dtype=np.int64))
+    if np.int64 in self.numeric_types:
+      self._testBinary(
+          math_ops.add,
+          np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
+          np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
+          expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36],
+                            dtype=np.int64))
 
   def testComplexOps(self):
     for dtype in self.complex_types:
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 6fe7b242e4..c073c02040 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -1160,7 +1160,6 @@ class EmbeddedComputationsTest(LocalComputationTest):
       self._ExecuteAndCompareClose(
           c, expected=np.sum(input_array, axis=tuple(dims)))
 
-    _ReduceAndTest(0)
     _ReduceAndTest(0)
     _ReduceAndTest(0, 1)
     _ReduceAndTest(0, 2)
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 1790c50d4d..c4c56c5692 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -97,9 +97,9 @@ bool ShouldIncludeWinogradNonfusedAlgo(const Shape& input_shape,
                                        const ConvolutionDimensionNumbers& dnums,
                                        se::StreamExecutor* stream_exec) {
   // Skip this check for cudnn7 and newer.
-  se::port::StatusOr<std::tuple<int, int, int>> version =
+  auto version =
       stream_exec->AsDnn()->GetVersion();
-  if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
+  if (version.ok() && version.ValueOrDie().major_version() >= 7) {
     return true;
   }
 
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 7b994a4c17..c4031dfee5 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -50,6 +50,13 @@ using TypesF16F32 = ::testing::Types<Eigen::half, float>;
 using TypesF16F32F64 = ::testing::Types<Eigen::half, float, double>;
 using TypesF16F32F64CF64 =
     ::testing::Types<Eigen::half, float, double, complex64>;
+#elif !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) && \
+    defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) && \
+    defined(XLA_BACKEND_DOES_NOT_SUPPORT_COMPLEX)
+using TypesF16F32 = ::testing::Types<Eigen::half, float>;
+using TypesF16F32F64 = ::testing::Types<Eigen::half, float>;
+using TypesF16F32F64CF64 =
+    ::testing::Types<Eigen::half, float>;
 #else
 #error "Situation not handled yet"
 #endif
diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py
index 2e5590b46c..554f0471d4 100644
--- a/tensorflow/contrib/autograph/converters/call_trees.py
+++ b/tensorflow/contrib/autograph/converters/call_trees.py
@@ -146,7 +146,7 @@ class CallTreeTransformer(transformer.Base):
       # Inspect the target function decorators. If any include a @convert
       # or @graph_ready annotation, then they must be called as they are.
       # TODO(mdan): This may be quite heavy.
-      # To parse and re-analize each function for every call site could be quite
+      # To parse and re-analyze each function for every call site could be quite
       # wasteful. Maybe we could cache the parsed AST?
       try:
         target_node, _ = parser.parse_entity(target_entity)
diff --git a/tensorflow/contrib/autograph/converters/call_trees_test.py b/tensorflow/contrib/autograph/converters/call_trees_test.py
index c666dcb73b..303dd54a4e 100644
--- a/tensorflow/contrib/autograph/converters/call_trees_test.py
+++ b/tensorflow/contrib/autograph/converters/call_trees_test.py
@@ -34,7 +34,7 @@ class CallTreesTest(converter_test_base.TestCase):
   def test_basic(self):
 
     def test_fn_1(_):
-      raise ValueError('This should not be called in the compiled verison.')
+      raise ValueError('This should not be called in the compiled version.')
 
     def renamed_test_fn_1(a):
       return a + 1
diff --git a/tensorflow/contrib/autograph/converters/decorators_test.py b/tensorflow/contrib/autograph/converters/decorators_test.py
index e67ab1cd6a..9c01f68912 100644
--- a/tensorflow/contrib/autograph/converters/decorators_test.py
+++ b/tensorflow/contrib/autograph/converters/decorators_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.platform import test
 
 # The Python parser only briefly captures decorators into the AST.
 # The interpreter desugars them on load, and the decorated function loses any
-# trace of the decorator (which is notmally what you would expect, since
+# trace of the decorator (which is normally what you would expect, since
 # they are meant to be transparent).
 # However, decorators are still visible when you analyze the function
 # from inside a decorator, before it was applied - as is the case
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index d874ef15c9..24f87b2c14 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -49,7 +49,7 @@ def convert(recursive=False, verbose=False, arg_types=None):
   function is called. This means the parameter values are known at compilation.
 
   Args:
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     verbose: Whether to output the compiled code in the logs.
     arg_types: See to_graph.
@@ -215,7 +215,7 @@ def to_graph(e,
 
   Args:
     e: A Python entity.
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     verbose: Whether to output the compiled code in the logs.
     arg_values: A dict containing value hints for symbols like function
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index e7230a5f45..55a30dc127 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -61,7 +61,7 @@ class ConversionMap(object):
   This object is mutable, and is updated as functions are converted.
 
   Attributes:
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     nocompile_decorators: tuple of decorator functions that toggle compilation
         off.
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
index b81f5c7f87..2c14c2c8c2 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
@@ -162,11 +162,11 @@ class Scope(object):
       self.parent.mark_returned(name)
 
 
-class ActivityAnalizer(transformer.Base):
+class ActivityAnalyzer(transformer.Base):
   """Annotates nodes with local scope information. See Scope."""
 
   def __init__(self, context, parent_scope):
-    super(ActivityAnalizer, self).__init__(context)
+    super(ActivityAnalyzer, self).__init__(context)
     self.scope = Scope(parent_scope)
     self._in_return_statement = False
 
@@ -356,4 +356,4 @@ class ActivityAnalizer(transformer.Base):
 
 
 def resolve(node, context, parent_scope=None):
-  return ActivityAnalizer(context, parent_scope).visit(node)
+  return ActivityAnalyzer(context, parent_scope).visit(node)
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
index d1c4a94b14..ef79a295bf 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
@@ -108,7 +108,7 @@ class ScopeTest(test.TestCase):
     self.assertFalse(QN('a') in child.referenced)
 
 
-class ActivityAnalizerTest(test.TestCase):
+class ActivityAnalyzerTest(test.TestCase):
 
   def _parse_and_analyze(self, test_fn):
     node, source = parser.parse_entity(test_fn)
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
index d6d9f7e1a6..b929b35b79 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Annotations used by the static analizer."""
+"""Annotations used by the static analyzer."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,15 +28,15 @@ class NoValue(Enum):
 
 
 class NodeAnno(NoValue):
-  """Additionnal annotations used by the static analyzer.
+  """Additional annotations used by the static analyzer.
 
   These are in addition to the basic annotations declared in anno.py.
   """
 
   # Symbols
   # These flags are boolean.
-  IS_LOCAL = 'Symbol is local to the function scope being analized.'
-  IS_PARAM = 'Symbol is a parameter to the function being analized.'
+  IS_LOCAL = 'Symbol is local to the function scope being analyzed.'
+  IS_PARAM = 'Symbol is a parameter to the function being analyzed.'
   IS_MODIFIED_SINCE_ENTRY = (
       'Symbol has been explicitly replaced in the current function scope.')
 
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index dfc3c86a3d..211e8eaee9 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -77,7 +77,7 @@ def is_tf_print_compatible(value):
 
 
 def dynamic_print(*values):
-  """Implementartion of print using dynamic dispatch.
+  """Implementation of print using dynamic dispatch.
 
   The function attempts to use tf.Print if all the values are compatible.
   Otherwise, it will fall back to py_func.
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index d193a8459d..032b859d46 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -44,15 +44,13 @@ def expectation_importance_sampler(f,
                                    n=None,
                                    seed=None,
                                    name='expectation_importance_sampler'):
-  r"""Monte Carlo estimate of `\\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\)`.
+  r"""Monte Carlo estimate of \\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\).
 
-  With `\\(p(z) := exp^{log_p(z)}\\)`, this `Op` returns
+  With \\(p(z) := exp^{log_p(z)}\\), this `Op` returns
 
-  ```
   \\(n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,\\)
   \\(\approx E_q[ f(Z) p(Z) / q(Z) ]\\)
   \\(=       E_p[f(Z)]\\)
-  ```
 
   This integral is done in log-space with max-subtraction to better handle the
   often extreme values that `f(z) p(z) / q(z)` can take on.
@@ -121,14 +119,12 @@ def expectation_importance_sampler_logspace(
     name='expectation_importance_sampler_logspace'):
   r"""Importance sampling with a positive function, in log-space.
 
-  With `\\(p(z) := exp^{log_p(z)}\\)`, and `\\(f(z) = exp{log_f(z)}\\)`,
+  With \\(p(z) := exp^{log_p(z)}\\), and \\(f(z) = exp{log_f(z)}\\),
   this `Op` returns
 
-  ```
   \\(Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,\\)
   \\(\approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]\\)
   \\(=       Log[E_p[f(Z)]]\\)
-  ```
 
   This integral is done in log-space with max-subtraction to better handle the
   often extreme values that `f(z) p(z) / q(z)` can take on.
@@ -196,13 +192,11 @@ def _logspace_mean(log_values):
 
 def expectation(f, samples, log_prob=None, use_reparametrization=True,
                 axis=0, keep_dims=False, name=None):
-  """Computes the Monte-Carlo approximation of `\\(E_p[f(X)]\\)`.
+  """Computes the Monte-Carlo approximation of \\(E_p[f(X)]\\).
 
   This function computes the Monte-Carlo approximation of an expectation, i.e.,
 
-  ```none
   \\(E_p[f(X)] \approx= m^{-1} sum_i^m f(x_j),  x_j\  ~iid\ p(X)\\)
-  ```
 
   where:
 
@@ -216,8 +210,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   parameterless distribution (e.g.,
   `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
   expectation, i.e.,
-  `grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n }` where
-  `S_n = Avg{\\(s_i\\)}` and `\\(s_i = f(x_i), x_i ~ p\\)`.
+  grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n } where
+  S_n = Avg{\\(s_i\\)} and \\(s_i = f(x_i), x_i ~ p\\).
 
   However, if p is not reparameterized, TensorFlow's gradient will be incorrect
   since the chain-rule stops at samples of non-reparameterized distributions.
@@ -296,7 +290,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   Args:
     f: Python callable which can return `f(samples)`.
     samples: `Tensor` of samples used to form the Monte-Carlo approximation of
-      `\\(E_p[f(X)]\\)`.  A batch of samples should be indexed by `axis`
+      \\(E_p[f(X)]\\).  A batch of samples should be indexed by `axis`
       dimensions.
     log_prob: Python callable which can return `log_prob(samples)`. Must
       correspond to the natural-logarithm of the pdf/pmf of each sample. Only
@@ -317,7 +311,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
 
   Returns:
     approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation
-      of `\\(E_p[f(X)]\\)`.
+      of \\(E_p[f(X)]\\).
 
   Raises:
     ValueError: if `f` is not a Python `callable`.
@@ -329,7 +323,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
     if not callable(f):
       raise ValueError('`f` must be a callable function.')
     if use_reparametrization:
-      return math_ops.reduce_mean(f(samples), axis=axis, keep_dims=keep_dims)
+      return math_ops.reduce_mean(f(samples), axis=axis, keepdims=keep_dims)
     else:
       if not callable(log_prob):
         raise ValueError('`log_prob` must be a callable function.')
@@ -349,7 +343,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
       # "Is there a floating point value of x, for which x-x == 0 is false?"
       # http://stackoverflow.com/q/2686644
       fx += stop(fx) * (logpx - stop(logpx))  # Add zeros_like(logpx).
-      return math_ops.reduce_mean(fx, axis=axis, keep_dims=keep_dims)
+      return math_ops.reduce_mean(fx, axis=axis, keepdims=keep_dims)
 
 
 def _sample_mean(values):
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 17dcb49f47..f9c22283b7 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -45,7 +45,7 @@ from tensorflow.python.platform import googletest
 def _squared_loss(label, unused_weights, predictions):
   """Unweighted loss implementation."""
   loss = math_ops.reduce_sum(
-      math_ops.square(predictions - label), 1, keep_dims=True)
+      math_ops.square(predictions - label), 1, keepdims=True)
   return loss
 
 
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
index cb964c80e9..f1d9d19b04 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency_test.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
@@ -73,7 +73,7 @@ class OnlyOneDep(checkpointable.Checkpointable):
 
 class SplitTests(test.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testSaveRestoreSplitDep(self):
     save_checkpoint = checkpointable_utils.Checkpoint(
         dep=SaveTensorSlicesAsDeps())
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index bdf3e98635..5f38a8e5c7 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -31,10 +31,14 @@ option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
 option(tensorflow_BUILD_MORE_PYTHON_TESTS "Build more python unit tests for contrib packages" OFF)
 option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF)
 option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
-option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
 option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
 option(tensorflow_DISABLE_EIGEN_FORCEINLINE "Disable forceinline, to speed up build on windows." OFF)
 
+# SIMD, MKL and MKLDNN options
+option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions" OFF)
+option(tensorflow_ENABLE_MKL_SUPPORT "Enable Intel MKL support" OFF)
+option(tensorflow_ENABLE_MKLDNN_SUPPORT "Enable Intel MKLDNN support, requires MKL enabled" OFF)
+
 # GPU, CUDA and cuDNN options
 option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
 set(tensorflow_CUDA_VERSION "9.0" CACHE STRING "CUDA version to build against")
@@ -124,8 +128,16 @@ endif()
 
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
-  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
-  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
+  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      # 64 bits
+      add_definitions(-DWIN64)
+  elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
+      # 32 bits
+      # temporary fix for #18241
+      add_definitions(-DEIGEN_DEFAULT_DENSE_INDEX_TYPE=std::int64_t)
+  endif()
+  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11)
+  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
   add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH)
   add_definitions(-DTF_COMPILE_LIBRARY)
   add_definitions(/bigobj /nologo /EHsc /GF /MP /Gm-)
@@ -162,12 +174,21 @@ endif()
 
 # MSVC SIMD instructions
 if (tensorflow_WIN_CPU_SIMD_OPTIONS)
+  include(CheckCXXCompilerFlag)
+  if (tensorflow_ENABLE_MKL_SUPPORT)
+    add_definitions(-DINTEL_MKL -DEIGEN_USE_VML)
+    if (NOT tensorflow_ENABLE_MKLDNN_SUPPORT)
+      add_definitions(-DINTEL_MKL_ML)
+    endif()
+  endif()
+  CHECK_CXX_COMPILER_FLAG("-fopenmp" COMPILER_OPT_OPENMP_SUPPORT)
+  if (COMPILER_OPT_OPENMP_SUPPORT)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+  endif()
   if (WIN32)
-    CHECK_CXX_COMPILER_FLAG("${tensorflow_WIN_CPU_SIMD_OPTIONS}" COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
+    CHECK_CXX_COMPILER_FLAG(${tensorflow_WIN_CPU_SIMD_OPTIONS} COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
     if(COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${tensorflow_WIN_CPU_SIMD_OPTIONS}")
-    else()
-      message(FATAL_ERROR "${tensorflow_WIN_CPU_SIMD_OPTIONS} not supported")
     endif()
   endif()
 endif()
@@ -302,6 +323,43 @@ if(HAIKU)
   list(APPEND tensorflow_EXTERNAL_LIBRARIES network)
 endif()
 
+if (tensorflow_ENABLE_MKL_SUPPORT)  # locate Intel MKL via MKL_HOME and add it to the build
+  if (WIN32)
+    find_path(MKL_HOME_PLATFORM mkl  # directory containing mkl/, looked up under a windows/ suffix
+      HINTS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../  # HINTS: user-supplied MKL_HOME is searched before system paths
+      PATH_SUFFIXES windows)
+    set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
+    set(MKL_LINK_DIRS
+      ${MKL_HOME_PLATFORM}/mkl/lib/intel64
+      ${MKL_HOME_PLATFORM}/tbb/lib/intel64/vc_mt
+      ${MKL_HOME_PLATFORM}/compiler/lib/intel64
+      ${MKL_HOME_PLATFORM}/mkl/tools/builder/lib)
+    set(MKL_REDIST_DLL_DIRS
+      ${MKL_HOME_PLATFORM}/redist/intel64/mkl
+      ${MKL_HOME_PLATFORM}/redist/intel64/tbb/vc_mt
+      ${MKL_HOME_PLATFORM}/redist/intel64/compiler)
+    list(APPEND tensorflow_EXTERNAL_LIBRARIES
+      mkl_intel_lp64_dll mkl_sequential_dll mkl_core_dll mkl_rt mkl_cdll_intel64)
+  endif()
+  if (UNIX)
+    # FIXME: complete the MKL paths on Linux
+    find_path(MKL_HOME_PLATFORM mkl
+      HINTS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
+      PATH_SUFFIXES linux)
+    set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
+    set(MKL_LINK_DIRS) # incomplete: no Linux link directories yet
+    set(MKL_REDIST_SO_DIRS) # incomplete: no Linux redistributable directories yet
+  endif()
+  include_directories(${MKL_INCLUDE_DIRS})
+  link_directories(${MKL_LINK_DIRS})
+  if (tensorflow_ENABLE_MKLDNN_SUPPORT)  # MKL-DNN is only considered when MKL itself is enabled
+    include(mkldnn)
+    list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES})
+    list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn)
+    include_directories(${mkldnn_INCLUDE_DIRS})
+  endif()
+endif()
+
 if (tensorflow_ENABLE_GPU)
   if (NOT WIN32)
     # Default install paths for cuda libraries in Linux
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index fe83bb3204..0b79f718d4 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -128,6 +128,18 @@ Step-by-step Windows build
      D:\local\cuda\bin
      ```
 
+   * When building with MKL support, install [MKL](https://software.intel.com/en-us/mkl) from Intel and append its redistributable DLL directories to your `PATH` environment variable.
+
+     In case TensorFlow fails to find the MKL DLLs during initialization, check your `PATH` environment variable.
+     It should contain the directories of the MKL DLLs. For example:
+
+     ```
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\mkl
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\compiler
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\tbb\vc_mt
+     ```
+
+
    * We assume that `cmake` and `git` are installed and in your `%PATH%`. If
      for example `cmake` is not in your path and it is installed in
      `C:\Program Files (x86)\CMake\bin\cmake.exe`, you can add this directory
@@ -166,7 +178,15 @@ Step-by-step Windows build
    More? -Dtensorflow_ENABLE_GPU=ON ^
    More? -DCUDNN_HOME="D:\...\cudnn"
    ```
+   To build with MKL support, add "^" at the end of the last line above, followed by:
+
+   ```
+   More? -Dtensorflow_ENABLE_MKL_SUPPORT=ON ^
+   More? -DMKL_HOME="D:\...\compilers_and_libraries"
+   ```
+
    To enable SIMD instructions with MSVC, as AVX and SSE, define it as follows:
+
    ```
    More? -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
    ```
@@ -226,6 +246,7 @@ Step-by-step Windows build
      ```
      ctest -C RelWithDebInfo
      ```
+
    * `-Dtensorflow_BUILD_MORE_PYTHON_TESTS=(ON|OFF)`. Defaults to `OFF`. This enables python tests on
      serveral major packages. This option is only valid if this and tensorflow_BUILD_PYTHON_TESTS are both set as `ON`.
      After building the python wheel, you need to install the new wheel before running the tests.
@@ -234,6 +255,12 @@ Step-by-step Windows build
      ctest -C RelWithDebInfo
      ```
 
+   * `-Dtensorflow_ENABLE_MKL_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include MKL support. If MKL is enabled, you need to install the [Intel Math Kernel Library](https://software.intel.com/en-us/mkl).
+     CMake expects the location of MKL to be passed as `-DMKL_HOME=path_you_install_mkl`.
+
+   * `-Dtensorflow_ENABLE_MKLDNN_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include MKL DNN support. MKL DNN is [Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN)](https://github.com/intel/mkl-dnn). You have to add `-Dtensorflow_ENABLE_MKL_SUPPORT=ON` before including MKL DNN support.
+
+
 4. Invoke MSBuild to build TensorFlow.
 
    To build the C++ example program, which will be created as a `.exe`
@@ -251,6 +278,7 @@ Step-by-step Windows build
    D:\...\build> MSBuild /p:Configuration=Release tf_python_build_pip_package.vcxproj
    ```
 
+
 Linux Continuous Integration build
 ==================================
 
diff --git a/tensorflow/contrib/cmake/external/gemmlowp.cmake b/tensorflow/contrib/cmake/external/gemmlowp.cmake
index a235442dc5..cdaa6b73b9 100644
--- a/tensorflow/contrib/cmake/external/gemmlowp.cmake
+++ b/tensorflow/contrib/cmake/external/gemmlowp.cmake
@@ -14,8 +14,8 @@
 # ==============================================================================
 include (ExternalProject)
 
-set(gemmlowp_URL https://github.com/google/gemmlowp/archive/6a2a90822e8546fc2bfa7044de0faf1c1cb4862f.zip)
-set(gemmlowp_HASH SHA256=3447948d219f3270383766bbe08942888c0eb4e0ca6663c0e0548502ec5bb77d)
+set(gemmlowp_URL https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip)
+set(gemmlowp_HASH SHA256=b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658)
 set(gemmlowp_BUILD ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 set(gemmlowp_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 
diff --git a/tensorflow/contrib/cmake/external/mkldnn.cmake b/tensorflow/contrib/cmake/external/mkldnn.cmake
new file mode 100644
index 0000000000..a639fdee36
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/mkldnn.cmake
@@ -0,0 +1,44 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)  # provides ExternalProject_Add()
+
+set(mkldnn_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/include)
+set(mkldnn_URL https://github.com/01org/mkl-dnn.git)
+set(mkldnn_BUILD ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src)
+set(mkldnn_TAG 3063b2e4c943983f6bf5f2fb9a490d4a998cd291)  # mkl-dnn commit that gets checked out
+
+if(WIN32)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")  # Visual Studio generators: library expected under Release/
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release/mkldnn.lib)
+  else()
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/mkldnn.lib)
+  endif()
+else()
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/libmkldnn.a)
+endif()
+
+ExternalProject_Add(mkldnn
+    PREFIX mkldnn
+    GIT_REPOSITORY ${mkldnn_URL}
+    GIT_TAG ${mkldnn_TAG}
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    BUILD_IN_SOURCE 1  # build inside the checked-out source tree
+    BUILD_BYPRODUCTS ${mkldnn_STATIC_LIBRARIES}
+    INSTALL_COMMAND ""  # no install step; the library path above points into the build tree
+    CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=Release  # always configured as Release
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -DMKLINC:STRING=${MKL_INCLUDE_DIRS}  # MKL include directory computed by the including CMakeLists.txt
+)
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 6cd66a6599..ad2af01bc0 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -15,32 +15,33 @@
 include (ExternalProject)
 
 set(png_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/png_archive)
-set(png_URL https://storage.googleapis.com/libpng-public-archive/libpng-1.2.53.tar.gz)
-set(png_HASH SHA256=e05c9056d7f323088fd7824d8c6acc03a4a758c4b4916715924edc5dd3223a72)
+set(png_URL https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.34.tar.gz)
+set(png_HASH SHA256=e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef)
 set(png_BUILD ${CMAKE_BINARY_DIR}/png/src/png)
 set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
     set(png_STATIC_LIBRARIES 
-      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
-      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_staticd.lib
+      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_static.lib)
   else()
     if(CMAKE_BUILD_TYPE EQUAL Debug)
       set(png_STATIC_LIBRARIES 
-        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib)
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_staticd.lib)
     else()
       set(png_STATIC_LIBRARIES 
-        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_static.lib)
     endif()
   endif()
 else()
-  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12.a)
+  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng16.a)
 endif()
 
 set(png_HEADERS
-    "${png_INSTALL}/include/libpng12/png.h"
-    "${png_INSTALL}/include/libpng12/pngconf.h"
+    "${png_INSTALL}/include/libpng16/png.h"
+    "${png_INSTALL}/include/libpng16/pngconf.h"
+    "${png_INSTALL}/include/libpng16/pnglibconf.h"
 )
 
 ExternalProject_Add(png
diff --git a/tensorflow/contrib/cmake/external/sqlite.cmake b/tensorflow/contrib/cmake/external/sqlite.cmake
index 57c4ae7651..7f835d2d51 100644
--- a/tensorflow/contrib/cmake/external/sqlite.cmake
+++ b/tensorflow/contrib/cmake/external/sqlite.cmake
@@ -15,8 +15,8 @@
 include (ExternalProject)
 
 set(sqlite_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/sqlite)
-set(sqlite_URL https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip)
-set(sqlite_HASH SHA256=208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4)
+set(sqlite_URL https://mirror.bazel.build/www.sqlite.org/2018/sqlite-amalgamation-3230100.zip)
+set(sqlite_HASH SHA256=4239a1f69e5721d07d9a374eb84d594225229e54be4ee628da2995f4315d8dfc)
 set(sqlite_BUILD ${CMAKE_CURRENT_BINARY_DIR}/sqlite/src/sqlite)
 set(sqlite_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/sqlite/install)
 
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index a1c320347f..b47c32f1c4 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -276,7 +276,7 @@ add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
 add_custom_command(OUTPUT
     ${VERSION_INFO_CC}
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    --raw_generate ${VERSION_INFO_CC}
+    ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir} --git_tag_override=${GIT_TAG_OVERRIDE}
     DEPENDS __force_rebuild)
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
@@ -341,9 +341,3 @@ add_dependencies(tf_core_framework
     tf_core_lib
     proto_text
 )
-
-if(WIN32)
-  # Cmake > 3.6 will quote this as -D"__VERSION__=\"MSVC\"" which nvcc fails on.
-  # Instead of defining this global, limit it to tf_core_framework where its used.
-  target_compile_definitions(tf_core_framework PRIVATE __VERSION__="MSVC")
-endif()
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index f6aaf41f73..c4bdb69d82 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -554,12 +554,13 @@ if(WIN32)
         set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow.def")
     endif()
     set_source_files_properties(${pywrap_tensorflow_deffile} PROPERTIES GENERATED TRUE)
-
+    math(EXPR tensorflow_target_bitness "${CMAKE_SIZEOF_VOID_P}*8")
     add_custom_command(TARGET pywrap_tensorflow_internal_static POST_BUILD
         COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
             --input "${pywrap_tensorflow_internal_static_dependencies}"
             --output "${pywrap_tensorflow_deffile}"
             --target _pywrap_tensorflow_internal.pyd
+            --bitness "${tensorflow_target_bitness}"
         BYPRODUCTS ${pywrap_tensorflow_deffile} # Required for Ninja
     )
 endif(WIN32)
@@ -589,6 +590,12 @@ add_library(pywrap_tensorflow_internal SHARED
     ${pywrap_tensorflow_deffile}
 )
 
+# GCC 5 has a bug that causes an undefined reference to a __cpu_model function when
+# linking to the tensorflow library. Linking gcc_s and gcc explicitly fixes it.
+if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+    target_link_libraries(pywrap_tensorflow_internal PRIVATE gcc_s gcc)
+endif()
+
 if(WIN32)
     add_dependencies(pywrap_tensorflow_internal pywrap_tensorflow_internal_static)
 endif(WIN32)
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 9738bbeb9a..38f40452b5 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -52,12 +52,13 @@ if(WIN32)
     set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/tensorflow.def")
   endif()
   set_source_files_properties(${tensorflow_deffile} PROPERTIES GENERATED TRUE)
-
+  math(EXPR tensorflow_target_bitness "${CMAKE_SIZEOF_VOID_P}*8")
   add_custom_command(TARGET tensorflow_static POST_BUILD
       COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
           --input "${tensorflow_static_dependencies}"
           --output "${tensorflow_deffile}"
           --target tensorflow.dll
+          --bitness "${tensorflow_target_bitness}"
   )
 endif(WIN32)
 
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index 91ca33f4c4..af48ef1fd4 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -65,6 +65,12 @@ if (tensorflow_ENABLE_GPU)
     file(GLOB tf_stream_executor_gpu_srcs
         "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
     )
+    if (NOT tensorflow_BUILD_CC_TESTS)
+        file(GLOB tf_stream_executor_gpu_tests
+            "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc"
+        )
+        list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests})
+    endif()
     list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
 endif()
 
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index 53c2285699..cffe069aa3 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -63,7 +63,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|"
                         r"^(TFE_\w*)$|"
                         r"tensorflow::|"
                         r"functor::|"
-                        r"nsync_|"
+                        r"\?nsync_|"
                         r"perftools::gputools")
 
 # We want to identify data members explicitly in the DEF file, so that no one
@@ -87,6 +87,7 @@ def get_args():
                       required=True)
   parser.add_argument("--output", help="output deffile", required=True)
   parser.add_argument("--target", help="name of the target", required=True)
+  parser.add_argument("--bitness", help="build target bitness", required=True)
   args = parser.parse_args()
   return args
 
@@ -125,7 +126,10 @@ def main():
     # Header for the def file.
     def_fp.write("LIBRARY " + args.target + "\n")
     def_fp.write("EXPORTS\n")
-    def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
+    if args.bitness == "64":
+      def_fp.write("\t??1OpDef@tensorflow@@UEAA@XZ\n")
+    else:
+      def_fp.write("\t??1OpDef@tensorflow@@UAE@XZ\n")
 
     # Each symbols returned by undname matches the same position in candidates.
     # We compare on undname but use the decorated name from candidates.
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index 721dc4d080..a5e065b93a 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -281,6 +281,21 @@ class CrfTest(test.TestCase):
         self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]),
                          expected_max_sequence[:sequence_lengths])
 
+  def testCrfDecodeZeroSeqLength(self):
+    """Checks that crf_decode runs when sequence_length holds zeros."""
+    with self.test_session() as sess:
+      # Both batch entries get a sequence length of zero.
+      potentials = np.ones([2, 10, 5], dtype=np.float32)
+      transitions = np.ones([5, 5], dtype=np.float32)
+      lengths = np.zeros([2], dtype=np.int32)
+      inputs = constant_op.constant(potentials)
+      transition_params = constant_op.constant(transitions)
+      sequence_lengths = constant_op.constant(lengths)
+      values = crf.crf_decode(inputs, transition_params, sequence_lengths)
+      tags, scores = sess.run(values)
+      # Only the ranks of the results are checked: rank 2 tags, rank 1 scores.
+      self.assertEqual(len(tags.shape), 2)
+      self.assertEqual(len(scores.shape), 1)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 1233c8f251..e37c029ceb 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -479,15 +479,17 @@ def crf_decode(potentials, transition_params, sequence_length):
     initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
     initial_state = array_ops.squeeze(initial_state, axis=[1])  # [B, O]
     inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
+    # Clamp at zero: `sequence_length - 1` would be negative for zero-length sequences.
+    sequence_length_less_one = math_ops.maximum(0, sequence_length - 1)
     backpointers, last_score = rnn.dynamic_rnn(  # [B, T - 1, O], [B, O]
         crf_fwd_cell,
         inputs=inputs,
-        sequence_length=sequence_length - 1,
+        sequence_length=sequence_length_less_one,
         initial_state=initial_state,
         time_major=False,
         dtype=dtypes.int32)
     backpointers = gen_array_ops.reverse_sequence(  # [B, T - 1, O]
-        backpointers, sequence_length - 1, seq_dim=1)
+        backpointers, sequence_length_less_one, seq_dim=1)
 
     # Computes backward decoding. Extract tag indices from backpointers.
     crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
@@ -497,7 +499,7 @@ def crf_decode(potentials, transition_params, sequence_length):
     decode_tags, _ = rnn.dynamic_rnn(  # [B, T - 1, 1]
         crf_bwd_cell,
         inputs=backpointers,
-        sequence_length=sequence_length - 1,
+        sequence_length=sequence_length_less_one,
         initial_state=initial_state,
         time_major=False,
         dtype=dtypes.int32)
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 00d9544602..d58198faf3 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -358,7 +358,8 @@ class _CudnnRNN(base_layer.Layer):
             "CUDA/CuDNN generations.")
       # Initialize opaque params with a tensor.
       self.kernel = vs.get_variable(
-          "opaque_kernel", initializer=opaque_params_t, validate_shape=False)
+          "opaque_kernel", dtype=self._plain_dtype,
+          initializer=opaque_params_t, validate_shape=False)
     # Create saveable in the outer scope of the cudnn subgraph, such that
     # alternative subgraph with platform-independent rnn cells can load the
     # checkpoints directly.
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 9d1e8b20c2..d59dd17aea 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -4,7 +4,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
@@ -482,12 +482,11 @@ py_test(
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "prefetching_ops_test",
     size = "small",
     srcs = ["prefetching_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index dbc35097dd..78ecce8f7d 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -163,7 +163,7 @@ class DatasetSerializationTestBase(test.TestCase):
                                  num_outputs,
                                  sparse_tensors=False,
                                  verify_exhausted=True):
-    """Verifies that restoring into an already initilized iterator works.
+    """Verifies that restoring into an already initialized iterator works.
 
     Args:
       ds_fn: See `run_core_tests`.
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index f8556a1b28..43aa4b1bd0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -409,7 +409,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
   def _testTwoThreadsNoContentionWithRaces(self, sloppy=False):
     """Tests where all the workers race in producing elements.
 
-    Note: this is in contrast with the prevous test which carefully sequences
+    Note: this is in contrast with the previous test which carefully sequences
     the execution of the map functions.
 
     Args:
@@ -495,7 +495,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
   def _testTwoThreadsNoContentionWithRacesAndBlocking(self, sloppy=False):
     """Tests where all the workers race in producing elements.
 
-    Note: this is in contrast with the prevous test which carefully sequences
+    Note: this is in contrast with the previous test which carefully sequences
     the execution of the map functions.
 
 
@@ -928,8 +928,7 @@ class DirectedInterleaveDatasetTest(test.TestCase):
         sess.run(next_element)
 
   def _normalize(self, vec):
-    batched = (len(vec.shape) == 2)
-    return vec / vec.sum(axis=1, keepdims=True) if batched else vec / vec.sum()
+    return vec / vec.sum()
 
   def _chi2(self, expected, actual):
     actual = np.asarray(actual)
@@ -938,35 +937,43 @@ class DirectedInterleaveDatasetTest(test.TestCase):
     chi2 = np.sum(diff * diff / expected, axis=0)
     return chi2
 
+  def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples):
+    # Create a dataset that samples each integer in `[0, num_datasets)`
+    # with probability given by `weights[i]`.
+    dataset = interleave_ops.sample_from_datasets([
+        dataset_ops.Dataset.from_tensors(i).repeat(None)
+        for i in range(num_datasets)
+    ], weights)
+    dataset = dataset.take(num_samples)
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      freqs = np.zeros([num_datasets])
+      for _ in range(num_samples):
+        freqs[sess.run(next_element)] += 1
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+    return freqs
+
   def testSampleFromDatasets(self):
-    random_seed.set_random_seed(1618)
+    random_seed.set_random_seed(1619)
     num_samples = 10000
-    rand_probs = self._normalize(np.random.random_sample((10,)))
-    rand_probs2 = self._normalize(np.random.random_sample((15,)))
+    rand_probs = self._normalize(np.random.random_sample((15,)))
 
-    for probs in [[.5, .5], [.85, .05, .1], rand_probs, rand_probs2]:
+    # Use chi-squared test to assert that the observed distribution matches the
+    # expected distribution. Based on the implementation in
+    # "tensorflow/python/kernel_tests/multinomial_op_test.py".
+    for probs in [[.85, .05, .1], rand_probs]:
       probs = np.asarray(probs)
+      classes = len(probs)
+      freqs = self._testSampleFromDatasetsHelper(probs, classes, num_samples)
+      self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3)
 
-      # Create a dataset that samples each integer in `[0, probs.shape[0])`
-      # with probability given by `probs[i]`.
-      dataset = interleave_ops.sample_from_datasets([
-          dataset_ops.Dataset.from_tensors(i).repeat(None)
-          for i in range(probs.shape[0])
-      ], probs)
-      dataset = dataset.take(num_samples)
-      iterator = dataset.make_one_shot_iterator()
-      next_element = iterator.get_next()
-
-      with self.test_session() as sess:
-        freqs = np.zeros_like(probs)
-        for _ in range(num_samples):
-          freqs[sess.run(next_element)] += 1
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
-
-      # Use chi-squared test to assert that the observed distribution
-      # matches the expected distribution. Based on the implementation
-      # in "tensorflow/python/kernel_tests/multinomial_op_test.py".
+      # Also check that `weights` as a dataset samples correctly.
+      probs_ds = dataset_ops.Dataset.from_tensors(probs).repeat()
+      freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples)
       self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3)
 
   def testErrors(self):
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
index 7acbc676ce..5c74ed6ae7 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -201,6 +201,14 @@ class StatsDatasetSerializationTest(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
             stats_ops.bytes_produced_stats("bytes_produced"))
 
+  def test_bytes_produced_stats_invalid_tag_shape(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(
+          lambda: dataset_ops.Dataset.range(100).apply(
+              stats_ops.bytes_produced_stats(["bytes_produced"])),
+          None, 100)
+
   def testBytesStatsDatasetSaveableCore(self):
     num_outputs = 100
     self.run_core_tests(
@@ -218,6 +226,14 @@ class StatsDatasetSerializationTest(
     return dataset_ops.Dataset.range(num_elements).apply(
         stats_ops.latency_stats(tag1)).apply(stats_ops.latency_stats(tag2))
 
+  def test_latency_stats_invalid_tag_shape(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(
+          lambda: dataset_ops.Dataset.range(100).apply(
+              stats_ops.latency_stats(["record_latency", "record_latency_2"])),
+          None, 100)
+
   def testLatencyStatsDatasetSaveableCore(self):
     num_outputs = 100
 
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 106a1ef388..812a50ecbf 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -200,10 +200,11 @@ def sample_from_datasets(datasets, weights=None, seed=None):
 
   Args:
     datasets: A list of @{tf.data.Dataset} objects with compatible structure.
-    weights: (Optional.) A list of `len(datasets)` floating-point values,
-      where `weights[i]` represents the probability with which an element
-      should be sampled from `datasets[i]`. Defaults to a uniform distribution
-      across `datasets`.
+    weights: (Optional.) A list of `len(datasets)` floating-point values where
+      `weights[i]` represents the probability with which an element should be
+      sampled from `datasets[i]`, or a @{tf.data.Dataset} object where each
+      element is such a list. Defaults to a uniform distribution across
+      `datasets`.
     seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
       random seed that will be used to create the distribution. See
       @{tf.set_random_seed} for behavior.
@@ -219,24 +220,23 @@ def sample_from_datasets(datasets, weights=None, seed=None):
   """
   num_datasets = len(datasets)
   if weights is None:
-    weights = array_ops.ones(
-        [num_datasets], dtype=dtypes.float32, name="weights")
-  else:
+    weights = dataset_ops.Dataset.from_tensors([1.0] * num_datasets).repeat()
+  elif not isinstance(weights, dataset_ops.Dataset):
     weights = ops.convert_to_tensor(weights, name="weights")
     if weights.dtype not in (dtypes.float32, dtypes.float64):
       raise TypeError("`weights` must be convertible to a tensor of "
                       "`tf.float32` or `tf.float64` elements.")
     if not weights.shape.is_compatible_with([num_datasets]):
       raise ValueError("`weights` must be a vector of length `len(datasets)`.")
+    weights = dataset_ops.Dataset.from_tensors(weights).repeat()
 
   # The `stateless_multinomial()` op expects log-probabilities, as opposed to
   # weights.
-  logits = math_ops.log(weights, name="logits")
-
-  def select_dataset(seed):
+  logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits"))
+  def select_dataset(logits, seed):
     return array_ops.squeeze(
-        stateless.stateless_multinomial([logits], 1, seed=seed), axis=[0, 1])
-
-  selector_input = random_ops.RandomDataset(seed).batch(2).map(select_dataset)
+        stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1])
+  selector_input = dataset_ops.Dataset.zip(
+      (logits_ds, random_ops.RandomDataset(seed).batch(2))).map(select_dataset)
 
   return DirectedInterleaveDataset(selector_input, datasets)
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 89c04dc89a..e4c9f8b58a 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -114,11 +114,13 @@ class _PrefetchToDeviceIterator(object):
       ret = remote_iterator.get_next()
       return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
+    iterator_device = gen_dataset_ops.iterator_get_device(
+        self._input_iterator._iterator_resource)
+
     with ops.device(device):
       self._buffering_resource = function_buffering_resource(
           f=_prefetch_fn,
-          target_device=gen_dataset_ops.iterator_get_device(
-              self._input_iterator._iterator_resource),
+          target_device=iterator_device,
           string_arg=input_iterator_handle,
           buffer_size=buffer_size,
           shared_name=shared_name)
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index 711a538697..60ef7efba4 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -57,7 +57,7 @@ class _ScanDataset(dataset_ops.Dataset):
     self._output_shapes = None
     self._output_types = None
 
-    # Iteratively rerun the scan function until reaching a fixed pont on
+    # Iteratively rerun the scan function until reaching a fixed point on
     # `self._state_shapes`.
     need_to_rerun = True
     while need_to_rerun:
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
index c8d795c3f6..243b5a0348 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
@@ -584,7 +584,6 @@ class DistributionShapeTest(test.TestCase):
 
   def testDistributionShapeGetDimsStatic(self):
     with self.test_session():
-      shaper = _DistributionShape(batch_ndims=0, event_ndims=0)
       shaper = _DistributionShape(batch_ndims=0, event_ndims=0)
       x = 1
       self.assertAllEqual((_empty_shape, _empty_shape, _empty_shape),
diff --git a/tensorflow/contrib/eager/python/saver_test.py b/tensorflow/contrib/eager/python/saver_test.py
index 1a7f7b85e6..4032e755f6 100644
--- a/tensorflow/contrib/eager/python/saver_test.py
+++ b/tensorflow/contrib/eager/python/saver_test.py
@@ -102,7 +102,6 @@ class SaverTest(test.TestCase):
       # Can still restore it.
       saver.restore(ckpt_prefix)
       self.assertEqual(v1.read_value().numpy(), 1.0)
-      self.assertEqual(v1.read_value().numpy(), 1.0)
       # However, cannot restore it with default name.
       with self.assertRaisesOpError('not found in checkpoint'):
         saver = _saver.Saver([v1, v2]).restore(ckpt_prefix)
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index ae2fd8b490..3dcf0374c8 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -485,7 +485,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
           reduction=losses.Reduction.NONE)
       # Averages loss over classes.
       unweighted_loss = math_ops.reduce_mean(
-          unweighted_loss, axis=-1, keep_dims=True)
+          unweighted_loss, axis=-1, keepdims=True)
     weights = head_lib._get_weights_and_check_match_logits(  # pylint:disable=protected-access,
         features=features, weight_column=self._weight_column, logits=logits)
     training_loss = losses.compute_weighted_loss(
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index fa2697800e..a8774d6dab 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -456,7 +456,7 @@ def _get_local_devices(device_type):
 
 
 def _split_batch(features, labels, number_of_shards, device):
-  """Split input features and labes into batches."""
+  """Split input features and labels into batches."""
 
   def ensure_divisible_by_shards(sequence):
     batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0]
@@ -602,7 +602,7 @@ def _local_device_setter(worker_device, ps_devices, ps_strategy):
 
 
 def _scale_tower_loss(tower_spec, loss_reduction, number_of_towers):
-  """Produce an EstimatorSpec with approproriately scaled loss."""
+  """Produce an EstimatorSpec with appropriately scaled loss."""
   if tower_spec.loss is None:
     return tower_spec
 
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index 5d77bc77e1..ccdd679d6a 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -54,10 +54,10 @@ def _covariance(x, diag):
   diagonal matrix just the diagonal is returned.
   """
   num_points = math_ops.to_float(array_ops.shape(x)[0])
-  x -= math_ops.reduce_mean(x, 0, keep_dims=True)
+  x -= math_ops.reduce_mean(x, 0, keepdims=True)
   if diag:
     cov = math_ops.reduce_sum(
-        math_ops.square(x), 0, keep_dims=True) / (num_points - 1)
+        math_ops.square(x), 0, keepdims=True) / (num_points - 1)
   else:
     cov = math_ops.matmul(x, x, transpose_a=True) / (num_points - 1)
   return cov
@@ -313,7 +313,7 @@ class GmmAlgorithm(object):
     # TODO(xavigonzalvo): look into alternatives to log for
     # reparametrization of variance parameters.
     det_expanded = math_ops.reduce_sum(
-        math_ops.log(self._covs + 1e-3), 1, keep_dims=True)
+        math_ops.log(self._covs + 1e-3), 1, keepdims=True)
     diff = shard - self._means
     x2 = math_ops.square(diff)
     cov_expanded = array_ops.expand_dims(1.0 / (self._covs + 1e-3), 2)
@@ -351,7 +351,7 @@ class GmmAlgorithm(object):
       shard_id: id of current shard_id.
     """
     self._prior_probs[shard_id] = math_ops.reduce_logsumexp(
-        self._probs[shard_id], axis=1, keep_dims=True)
+        self._probs[shard_id], axis=1, keepdims=True)
 
   def _define_expectation_operation(self, shard_id):
     # Shape broadcasting.
@@ -375,7 +375,7 @@ class GmmAlgorithm(object):
     """
     # Soft assignment of each data point to each of the two clusters.
     self._points_in_k[shard_id] = math_ops.reduce_sum(
-        self._w[shard_id], 0, keep_dims=True)
+        self._w[shard_id], 0, keepdims=True)
     # Partial means.
     w_mul_x = array_ops.expand_dims(
         math_ops.matmul(
@@ -454,7 +454,7 @@ class GmmAlgorithm(object):
     for shard_id, prior_probs in enumerate(self._prior_probs):
       op.append(prior_probs + math_ops.log(self._w[shard_id]))
     self._scores = array_ops.squeeze(
-        math_ops.reduce_logsumexp(op, axis=2, keep_dims=True), axis=0)
+        math_ops.reduce_logsumexp(op, axis=2, keepdims=True), axis=0)
 
 
 def gmm(inp,
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index bfe338c9f9..9ffdd3ba5e 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -374,11 +374,11 @@ class KMeansClustering(estimator.Estimator):
               than `num_clusters`, a TensorFlow runtime error occurs.
       distance_metric: The distance metric used for clustering. One of:
         * `KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE`: Euclidean distance
-             between vectors `u` and `v` is defined as `\\(||u - v||_2\\)`
+             between vectors `u` and `v` is defined as \\(||u - v||_2\\)
              which is the square root of the sum of the absolute squares of
              the elements' difference.
         * `KMeansClustering.COSINE_DISTANCE`: Cosine distance between vectors
-             `u` and `v` is defined as `\\(1 - (u . v) / (||u||_2 ||v||_2)\\)`.
+             `u` and `v` is defined as \\(1 - (u . v) / (||u||_2 ||v||_2)\\).
       random_seed: Python integer. Seed for PRNG used to initialize centers.
       use_mini_batch: A boolean specifying whether to use the mini-batch k-means
         algorithm. See explanation above.
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index bb4f1eb384..11397e86bd 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -118,12 +118,13 @@ from tensorflow.python.framework.smart_cond import smart_cond
 from tensorflow.python.framework.smart_cond import smart_constant_value
 from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
 from tensorflow.python.framework.tensor_spec import TensorSpec
+from tensorflow.python.ops.array_ops import broadcast_to
 from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['nest']
+_allowed_symbols = ['nest', 'broadcast_to']
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
index a2834b6489..8fc4f60492 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
@@ -48,7 +48,7 @@ class LocalVariabletest(test.TestCase):
       variables = variables_lib.local_variables()
       self.assertEquals(2, len(variables))
       self.assertRaises(errors_impl.OpError, sess.run, variables)
-      variables_lib.initialize_variables(variables).run()
+      variables_lib.variables_initializer(variables).run()
       self.assertAllEqual(set([value0, value1]), set(sess.run(variables)))
 
 
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
index a97adf622e..983b6dc8e5 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
@@ -65,7 +65,7 @@ def fused_conv2d_bias_activation(conv_input,
     side_input_scale: A scalar `float32` that will be multiplied by side_input.
         This is optional and defaults to 0.
     side_input: A `Tensor` of the format specified by `data_format`.
-        This is useful for imlementing ResNet blocks.
+        This is useful for implementing ResNet blocks.
     activation_mode: (optional) currently must be the default "Relu".
         Note that in qint8 mode, it also clips to 127, so acts like ReluX.
     data_format: Specifies the data format.
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
index bb155aa249..3d0ed89932 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
@@ -566,7 +566,7 @@ def GetInceptionFwdTest(input_size, filter_size, stride, padding,
   return Test
 
 
-def CalculateCovolvedOutputDim(input_dim, filter_dim, stride, padding_type):
+def CalculateConvolvedOutputDim(input_dim, filter_dim, stride, padding_type):
   """Calculates the size of an output dimension of a strided convolution.
 
   Given the sizes of the corresponding dimension of the input and filter shapes,
@@ -827,10 +827,10 @@ class FusedConvInt8Tests(test.TestCase):
             maxval=1.0,
             dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
 
-    output_height = CalculateCovolvedOutputDim(input_height, filter_height,
-                                               vertical_stride, padding_type)
-    output_width = CalculateCovolvedOutputDim(input_width, filter_width,
-                                              horizontal_stride, padding_type)
+    output_height = CalculateConvolvedOutputDim(input_height, filter_height,
+                                                vertical_stride, padding_type)
+    output_width = CalculateConvolvedOutputDim(input_width, filter_width,
+                                               horizontal_stride, padding_type)
     print("output_height=", output_height, ", output_width=", output_width)
 
     side_input, _, _ = gen_array_ops.quantize_v2(
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
index 4b10bc0f8e..4b1105f6bd 100644
--- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
@@ -161,7 +161,7 @@ def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim):
     proj = random_ops.random_normal(
         [array_ops.shape(a)[1], random_projection_dim])
     proj *= math_ops.rsqrt(
-        math_ops.reduce_sum(math_ops.square(proj), 0, keep_dims=True))
+        math_ops.reduce_sum(math_ops.square(proj), 0, keepdims=True))
     # Project both distributions and sort them.
     proj_a = math_ops.matmul(a, proj)
     proj_b = math_ops.matmul(b, proj)
diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
index f8b372546b..650eab97a3 100644
--- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
@@ -64,11 +64,11 @@ def _statistics(x, axes):
   y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
 
   # Compute true mean while keeping the dims for proper broadcasting.
-  shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keep_dims=True))
+  shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keepdims=True))
 
-  shifted_mean = math_ops.reduce_mean(y - shift, axes, keep_dims=True)
+  shifted_mean = math_ops.reduce_mean(y - shift, axes, keepdims=True)
   mean = shifted_mean + shift
-  mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keep_dims=True)
+  mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keepdims=True)
 
   mean = array_ops.squeeze(mean, axes)
   mean_squared = array_ops.squeeze(mean_squared, axes)
diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md
index 163993a3f6..68e34f3b09 100644
--- a/tensorflow/contrib/hvx/README.md
+++ b/tensorflow/contrib/hvx/README.md
@@ -42,11 +42,12 @@ If you've finished walking through the quick start guide, you may want to try bu
 
 ### Build libhexagon\_nn\_skel.so
 
-Download Hexagon NN library from codeaurora.org and build it.
+Download Hexagon NN library from codeaurora.org and build it. For Hexagon SDK 3.0, we need to use the compatible version ([721b2d58f](https://source.codeaurora.org/quic/hexagon_nn/nnlib/commit/?id=721b2d58f0f4e2d5b182f41e6b7c4db5356bf0fb)) of nnlib.
 
 ```shell
 git clone https://source.codeaurora.org/quic/hexagon_nn/nnlib
 cd nnlib
+git reset 721b2d58f --hard
 ```
 
 Just follow the instructions in `README.HOW_TO_BUILD`. You can find the file `libhexagon_nn_skel.so` in `hexagon_Release_dynamic_toolv72_v60/ship`.
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
index 1be97ae3d6..bbb3a3b18f 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -53,7 +53,7 @@ void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
   OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                           DT_FLOAT, TensorShape({kChannelSize * kChannelSize}),
                           &tranformation_matrix));
-  // TODO(huangyp): It takes about 3.5 us to comute tranformation_matrix
+  // TODO(huangyp): It takes about 3.5 us to compute tranformation_matrix
   // with one thread. Improve its performance if necessary.
   internal::compute_tranformation_matrix_cuda<<<1, 1, 0, cu_stream>>>(
       delta_h, scale_s, scale_v, tranformation_matrix.flat<float>().data(),
diff --git a/tensorflow/contrib/image/ops/distort_image_ops.cc b/tensorflow/contrib/image/ops/distort_image_ops.cc
index b169b0b2b2..ca49635d5d 100644
--- a/tensorflow/contrib/image/ops/distort_image_ops.cc
+++ b/tensorflow/contrib/image/ops/distort_image_ops.cc
@@ -36,9 +36,9 @@ REGISTER_OP("AdjustHsvInYiq")
 Adjust the YIQ hue of one or more images.
 
 `images` is a tensor of at least 3 dimensions.  The last dimension is
-interpretted as channels, and must be three.
+interpreted as channels, and must be three.
 
-We used linear transfomation described in:
+We used linear transformation described in:
  beesbuzz.biz/code/hsv_color_transforms.php
 The input image is considered in the RGB colorspace. Conceptually, the RGB
 colors are first mapped into YIQ space, rotated around the Y channel by
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index e97267fb89..295908d44b 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -137,7 +137,7 @@ row_to_col_match_indices: A vector of length num_rows, which is the number of
   If `row_to_col_match_indices[i]` is not -1, row i is matched to column
   `row_to_col_match_indices[i]`.
 col_to_row_match_indices: A vector of length num_columns, which is the number
-  of columns of the input ditance matrix.
+  of columns of the input distance matrix.
   If `col_to_row_match_indices[j]` is not -1, column j is matched to row
   `col_to_row_match_indices[j]`.
 )doc");
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
index 8139d4272d..bd784c6bda 100755
--- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -69,7 +69,7 @@ Outputs a single image random dot stereogram for export via encode_PNG/JPG OP.
 Given the 2-D tensor 'depth_values' with encoded Z values, this operation will
 encode 3-D data into a 2-D image.  The output of this Op is suitable for the
 encode_PNG/JPG ops.  Be careful with image compression as this may corrupt the
-encode 3-D data witin the image.
+encoded 3-D data within the image.
 
 This Op is based upon:
 'http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper'
@@ -111,7 +111,7 @@ output_image_shape: Output size of returned image in X,Y, Channels 1-grayscale,
 output_data_window: Size of "DATA" window, must be equal to or smaller than 'output_image_shape', will be centered
   and use 'convergence_dots_size' for best fit to avoid overlap if possible
 
-image:= A tensor of size 'output_image_shape' with the encloded 'depth_values'
+image:= A tensor of size 'output_image_shape' with the encoded 'depth_values'
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index a8d8cf8c5c..d3c114a88d 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -438,7 +438,7 @@ def bipartite_match(distance_mat,
       of rows of the input `distance_matrix`. If `row_to_col_match_indices[i]`
       is not -1, row i is matched to column `row_to_col_match_indices[i]`.
     col_to_row_match_indices: A vector of length num_columns, which is the
-      number of columns of the input ditance matrix.
+      number of columns of the input distance matrix.
       If `col_to_row_match_indices[j]` is not -1, column j is matched to row
       `col_to_row_match_indices[j]`.
   """
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
index d4a6a5bcbb..0ceb683ff4 100755
--- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -45,7 +45,7 @@ def single_image_random_dot_stereograms(depth_values,
   Given the 2-D tensor 'depth_values' with encoded Z values, this operation
   will encode 3-D data into a 2-D image.  The output of this Op is suitable
   for the encode_PNG/JPG ops.  Be careful with image compression as this may
-  corrupt the encode 3-D data witin the image.
+  corrupt the encoded 3-D data within the image.
 
   Based upon [this
   paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions.py b/tensorflow/contrib/kfac/python/ops/loss_functions.py
index e7d4243fc3..42d525c2c2 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions.py
@@ -613,19 +613,19 @@ class CategoricalLogitsNegativeLogProbLoss(DistributionNegativeLogProbLoss,
   def multiply_fisher(self, vector):
     probs = self._probs
     return vector * probs - probs * math_ops.reduce_sum(
-        vector * probs, axis=-1, keep_dims=True)
+        vector * probs, axis=-1, keepdims=True)
 
   def multiply_fisher_factor(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - probs * math_ops.reduce_sum(
-        sqrt_probs * vector, axis=-1, keep_dims=True)
+        sqrt_probs * vector, axis=-1, keepdims=True)
 
   def multiply_fisher_factor_transpose(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - sqrt_probs * math_ops.reduce_sum(
-        probs * vector, axis=-1, keep_dims=True)
+        probs * vector, axis=-1, keepdims=True)
 
   def multiply_fisher_factor_replicated_one_hot(self, index):
     assert len(index) == 1, "Length of index was {}".format(len(index))
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py b/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
index 705a871d48..4279cb2792 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
@@ -33,7 +33,6 @@ _allowed_symbols = [
     "CategoricalLogitsNegativeLogProbLoss",
     "OnehotCategoricalLogitsNegativeLogProbLoss",
     "MultiBernoulliNegativeLogProbLoss",
-    "MultiBernoulliNegativeLogProbLoss",
     "insert_slice_in_zeros",
 ]
 
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
index 0727f4cf88..39e9d65407 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
@@ -660,7 +660,7 @@ class ReduceSumTest(Base):
     sum_lt = ops.reduce_sum(self.original_lt, {('channel', 'hihowareyou')})
     golden_lt = core.LabeledTensor(
         math_ops.reduce_sum(
-            self.original_lt.tensor, 1, keep_dims=True),
+            self.original_lt.tensor, 1, keepdims=True),
         [self.a0, ('channel', ['hihowareyou']), self.a2, self.a3])
     self.assertLabeledTensorsEqual(sum_lt, golden_lt)
 
@@ -668,7 +668,7 @@ class ReduceSumTest(Base):
     sum_lt = ops.reduce_sum(self.original_lt, ('channel', 'hihowareyou'))
     golden_lt = core.LabeledTensor(
         math_ops.reduce_sum(
-            self.original_lt.tensor, 1, keep_dims=True),
+            self.original_lt.tensor, 1, keepdims=True),
         [self.a0, ('channel', ['hihowareyou']), self.a2, self.a3])
     self.assertLabeledTensorsEqual(sum_lt, golden_lt)
 
diff --git a/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py b/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
index f701647c2b..28ddaa69a1 100644
--- a/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
+++ b/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
@@ -200,7 +200,7 @@ class SparseCrossOpTest(test.TestCase):
       self._assert_sparse_tensor_equals(expected_out, sess.run(op))
 
   def test_large_batch(self):
-    """Tests with large batch size to force multithreding.
+    """Tests with large batch size to force multithreading.
     """
     batch_size = 5000
     col1 = []
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 9ccb589d69..3ae07cedab 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -48,7 +48,7 @@ you should choose depends on (1) the feature type and (2) the model type.
    recommended.
 
      embedded_dept_column = embedding_column(
-       sparse_column_with_keys("department", ["math", "philosphy", ...]),
+       sparse_column_with_keys("department", ["math", "philosophy", ...]),
        dimension=10)
 
 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops.py b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
index 78affea44c..06060b99e7 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
@@ -815,7 +815,7 @@ class _Transformer(object):
   """
 
   def __init__(self, columns_to_tensors):
-    """Initializes transfomer.
+    """Initializes transformer.
 
     Args:
       columns_to_tensors: A mapping from feature columns to tensors. 'string'
@@ -908,7 +908,7 @@ def _gather_feature_columns(feature_columns):
 
 
 def _check_forbidden_sequence_columns(feature_columns):
-  """Recursively cecks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`."""
+  """Recursively checks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`."""
   all_feature_columns = _gather_feature_columns(feature_columns)
   for feature_column in all_feature_columns:
     if isinstance(feature_column, _FORBIDDEN_SEQUENCE_COLUMNS):
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 25c3b1e7ea..2f3e57653c 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -932,7 +932,8 @@ def convolution(inputs,
                 variables_collections=None,
                 outputs_collections=None,
                 trainable=True,
-                scope=None):
+                scope=None,
+                conv_dims=None):
   """Adds an N-D convolution followed by an optional batch_norm layer.
 
   It is required that 1 <= N <= 3.
@@ -993,6 +994,10 @@ def convolution(inputs,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     scope: Optional scope for `variable_scope`.
+    conv_dims: Optional convolution dimensionality. When set, the input must
+      have rank `conv_dims + 2` (e.g. 2 for Conv 2D, 3 for Conv 3D), otherwise
+      a `ValueError` is raised. When left as None, the dimensionality is chosen
+      from the input rank (i.e. Conv ND, with N = input_rank - 2).
 
   Returns:
     A tensor representing the output of the operation.
@@ -1015,6 +1020,9 @@ def convolution(inputs,
     inputs = ops.convert_to_tensor(inputs)
     input_rank = inputs.get_shape().ndims
 
+    if conv_dims is not None and conv_dims + 2 != input_rank:
+      raise ValueError('Convolution expects input with rank %d, got %d' %
+                       (conv_dims + 2, input_rank))
     if input_rank == 3:
       layer_class = convolutional_layers.Convolution1D
     elif input_rank == 4:
@@ -1061,10 +1069,134 @@ def convolution(inputs,
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
+@add_arg_scope
+def convolution1d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=1)
+
+convolution1d.__doc__ = convolution.__doc__
 
-convolution2d = convolution
-convolution3d = convolution
+@add_arg_scope
+def convolution2d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=2)
+
+convolution2d.__doc__ = convolution.__doc__
 
+@add_arg_scope
+def convolution3d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=3)
+
+convolution3d.__doc__ = convolution.__doc__
 
 @add_arg_scope
 def convolution2d_in_plane(
@@ -1411,7 +1543,7 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
   Args:
      tensor: An `int` `Tensor` to be converted to a `Sparse`.
      eos_token: An integer.
-       It is part of the target label that signfies the end of a sentence.
+       It is part of the target label that signifies the end of a sentence.
      outputs_collections: Collection to add the outputs.
      scope: Optional scope for name_scope.
   """
@@ -1555,7 +1687,7 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
     output_collections: Collection to which the outputs will be added.
     scope: Optional scope for `name_scope`.
   Returns:
-    A `Tensor` or `SparseTensor` conataining the same values as `inputs`, but
+    A `Tensor` or `SparseTensor` containing the same values as `inputs`, but
     with innermost dimensions flattened to obtain rank `new_rank`.
 
   Raises:
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 997f910a2a..b01fd5d5c9 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -310,6 +310,17 @@ class BiasAddTest(test.TestCase):
 
 class ConvolutionTest(test.TestCase):
 
+  def testInvalidShape(self):
+    with self.test_session():
+      images_2d = random_ops.random_uniform((5, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 5, got 4'):
+        layers_lib.convolution3d(images_2d, 32, 3)
+      images_3d = random_ops.random_uniform((5, 6, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 4, got 5'):
+        layers_lib.convolution2d(images_3d, 32, 3)
+
   def testInvalidDataFormat(self):
     height, width = 7, 9
     with self.test_session():
@@ -3155,7 +3166,7 @@ class RepeatTests(test.TestCase):
     with self.test_session():
       images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
       output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
-      self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Repeat/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])
 
   def testRepeatWithScope(self):
@@ -3749,7 +3760,7 @@ class StackTests(test.TestCase):
           layers_lib.convolution2d, [10, 20, 30],
           kernel_size=[3, 3],
           padding='SAME')
-      self.assertEqual(output.op.name, 'Stack/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Stack/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 30])
 
   def testStackWithScope(self):
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index 392a490be1..8c118402a4 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -60,8 +60,8 @@ class RevBlockTest(test.TestCase):
       sess.run(variables.global_variables_initializer())
       x1, x2, x1_inv, x2_inv = sess.run([x1, x2, x1_inv, x2_inv])
 
-      self.assertAllClose(x1, x1_inv)
-      self.assertAllClose(x2, x2_inv)
+      self.assertAllClose(x1, x1_inv, atol=1e-5)
+      self.assertAllClose(x2, x2_inv, atol=1e-5)
 
   def testBackwardForward(self):
 
diff --git a/tensorflow/contrib/layers/python/layers/utils_test.py b/tensorflow/contrib/layers/python/layers/utils_test.py
index 3409860add..645dc1291e 100644
--- a/tensorflow/contrib/layers/python/layers/utils_test.py
+++ b/tensorflow/contrib/layers/python/layers/utils_test.py
@@ -294,7 +294,6 @@ class NPositiveIntegersTest(test.TestCase):
     self.assertEqual(utils.n_positive_integers(2, 2), (2, 2))
     self.assertEqual(utils.n_positive_integers(2, (2, 3)), (2, 3))
     self.assertEqual(utils.n_positive_integers(3, (2, 3, 1)), (2, 3, 1))
-    self.assertEqual(utils.n_positive_integers(3, (2, 3, 1)), (2, 3, 1))
     self.assertEqual(
         utils.n_positive_integers(3, tensor_shape.TensorShape([2, 3, 1])),
         (2, 3, 1))
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
index b28835a809..584556992a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
@@ -36,7 +36,6 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import flags
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 8c85c431be..14ee2ba609 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -299,6 +299,7 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
     # so instead of breaking compatibility with that assumption, we
     # just manually initialize this field:
     self._train_distribute = None
+    self._device_fn = None
 
     gpu_options = config_pb2.GPUOptions(
         per_process_gpu_memory_fraction=gpu_memory_fraction)
diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/Makefile
index b4504f246a..65fba52d46 100644
--- a/tensorflow/contrib/lite/Makefile
+++ b/tensorflow/contrib/lite/Makefile
@@ -90,7 +90,8 @@ $(wildcard tensorflow/contrib/lite/kernels/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/optimized/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/reference/*.c) \
-$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc)
+$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc) \
+$(wildcard tensorflow/contrib/lite/downloads/fft2d/fftsg.c)
 # Remove any duplicates.
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
 CORE_CC_EXCLUDE_SRCS := \
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index a93ed201d6..436c3e1d4c 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -30,12 +30,15 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
+# the archive has been propagated in mirror.bazel.build.
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
 FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
 FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/master.zip"
+FFT2D_URL="https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
@@ -91,6 +94,7 @@ download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 download_and_extract "${NEON_2_SSE_URL}" "${DOWNLOADS_DIR}/neon_2_sse"
 download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
+download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
 
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
diff --git a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
index b0236e9c60..98d3b5bb8a 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -326,10 +326,6 @@
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = YES;
@@ -373,10 +369,6 @@
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
diff --git a/tensorflow/contrib/lite/g3doc/apis.md b/tensorflow/contrib/lite/g3doc/apis.md
index fe208e47d1..50cc146a87 100644
--- a/tensorflow/contrib/lite/g3doc/apis.md
+++ b/tensorflow/contrib/lite/g3doc/apis.md
@@ -29,7 +29,7 @@ interpreter->AllocateTensors();
 float* input = interpreter->typed_input_tensor<float>(0);
 // Fill `input`.
 interpreter->Invoke();
-float* output = interpreter->type_output_tensor<float>(0);
+float* output = interpreter->typed_output_tensor<float>(0);
 ```
 ### Data Alignment
 
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 300786c3ca..18f6465188 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -54,6 +54,9 @@ import android.view.Surface;
 import android.view.TextureView;
 import android.view.View;
 import android.view.ViewGroup;
+import android.widget.CompoundButton;
+import android.widget.NumberPicker;
+import android.widget.ToggleButton;
 import android.widget.TextView;
 import android.widget.Toast;
 import java.io.IOException;
@@ -82,6 +85,8 @@ public class Camera2BasicFragment extends Fragment
   private boolean runClassifier = false;
   private boolean checkedPermissions = false;
   private TextView textView;
+  private ToggleButton toggle;
+  private NumberPicker np;
   private ImageClassifier classifier;
 
   /** Max preview width that is guaranteed by Camera2 API */
@@ -289,6 +294,33 @@ public class Camera2BasicFragment extends Fragment
   public void onViewCreated(final View view, Bundle savedInstanceState) {
     textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
     textView = (TextView) view.findViewById(R.id.text);
+    toggle = (ToggleButton) view.findViewById(R.id.button);
+
+    // Forward the toggle state to the classifier's NNAPI setting.
+    toggle.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
+      @Override
+      public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
+        // classifier is assigned outside this method, so it may still be null here.
+        if (classifier != null) {
+          classifier.setUseNNAPI(isChecked);
+        }
+      }
+    });
+
+    // Thread-count picker limited to the range 1..10.
+    np = (NumberPicker) view.findViewById(R.id.np);
+    np.setMinValue(1);
+    np.setMaxValue(10);
+    np.setWrapSelectorWheel(true);
+    np.setOnValueChangedListener(new NumberPicker.OnValueChangeListener() {
+      @Override
+      public void onValueChange(NumberPicker picker, int oldVal, int newVal) {
+        // Same null guard as above: the classifier may not have been created.
+        if (classifier != null) {
+          classifier.setNumThreads(newVal);
+        }
+      }
+    });
   }
 
   /** Load the model and labels. */
   /** Load the model and labels. */
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index c57bb348c5..d32c077910 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -142,6 +142,20 @@ public abstract class ImageClassifier {
     }
   }
 
+  /** Forwards the NNAPI on/off flag to {@code tflite}; does nothing when it is null. */
+  public void setUseNNAPI(boolean nnapi) {
+    if (tflite != null) {
+      tflite.setUseNNAPI(nnapi);
+    }
+  }
+
+  /** Forwards the thread count to {@code tflite}; does nothing when it is null. */
+  public void setNumThreads(int num_threads) {
+    if (tflite != null) {
+      tflite.setNumThreads(num_threads);
+    }
+  }
+
   /** Closes tflite to release resources. */
   public void close() {
     tflite.close();
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
index 15305c436e..db557ad62f 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
@@ -22,24 +22,59 @@
         android:layout_width="wrap_content"
         android:layout_height="wrap_content"
         android:layout_alignParentStart="true"
+        android:layout_alignParentLeft="true"
         android:layout_alignParentTop="true" />
 
     <FrameLayout
         android:id="@+id/control"
         android:layout_width="match_parent"
-        android:layout_height="112dp"
+        android:layout_height="135dp"
         android:layout_alignParentBottom="true"
         android:layout_alignParentStart="true"
+        android:layout_alignParentLeft="true"
+        android:layout_alignParentEnd="true"
+        android:layout_alignParentRight="true"
+        android:layout_marginEnd="150dp"
+        android:layout_marginRight="150dp"
         android:background="@color/control_background">
 
-        <TextView android:id="@+id/text"
+        <TextView
+            android:id="@+id/text"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
-            android:paddingLeft="80dp"
+            android:paddingLeft="20dp"
             android:textColor="#FFF"
             android:textSize="20sp"
             android:textStyle="bold" />
 
     </FrameLayout>
 
+    <RelativeLayout
+        android:id="@+id/control2"
+        android:layout_width="match_parent"
+        android:layout_height="135dp"
+        android:layout_alignParentLeft="true"
+        android:layout_alignParentStart="true"
+        android:layout_alignTop="@+id/control"
+        android:layout_marginLeft="300dp"
+        android:layout_marginStart="300dp"
+        android:background="@color/control_background">
+
+        <ToggleButton
+            android:id="@+id/button"
+            android:textOff="@string/tflite"
+            android:textOn="@string/nnapi"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_alignParentLeft="true"
+            android:layout_alignParentStart="true" />
+
+        <NumberPicker
+            android:id="@+id/np"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_below="@+id/button"
+            android:visibility="visible" />
+    </RelativeLayout>
+
 </RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
index a08ec3eb62..29a033bcd4 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
@@ -21,4 +21,6 @@
     <string name="toggle_turn_on">NN:On</string>
     <string name="toggle_turn_off">NN:Off</string>
     <string name="toggle">Use NNAPI</string>
+    <string name="tflite">tflite</string>
+    <string name="nnapi">NNAPI</string>
 </resources>
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index e915e65aa1..e84ee71129 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -215,6 +215,13 @@ public final class Interpreter implements AutoCloseable {
     }
   }
 
+  public void setNumThreads(int num_threads) {
+    if (wrapper == null) {
+      throw new IllegalStateException("The interpreter has already been closed.");
+    }
+    wrapper.setNumThreads(num_threads);
+  }
+
   /** Release resources associated with the {@code Interpreter}. */
   @Override
   public void close() {
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index dfc8ac111a..2fc803715b 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -153,6 +153,10 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     useNNAPI(interpreterHandle, useNNAPI);
   }
 
+  void setNumThreads(int num_threads) {
+    numThreads(interpreterHandle, num_threads);
+  }
+
   /** Gets index of an input given its name. */
   int getInputIndex(String name) {
     if (inputsIndexes == null) {
@@ -324,6 +328,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private static native void useNNAPI(long interpreterHandle, boolean state);
 
+  private static native void numThreads(long interpreterHandle, int num_threads);
+
   private static native long createErrorReporter(int size);
 
   private static native long createModel(String modelPathOrBuffer, long errorHandle);
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index ccfdfd829b..45f510da1d 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -320,6 +320,16 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
   interpreter->UseNNAPI(static_cast<bool>(state));
 }
 
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jint num_threads) {
+  tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
+  if (interpreter == nullptr) return;
+  interpreter->SetNumThreads(static_cast<int>(num_threads));
+}
+
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createErrorReporter(
     JNIEnv* env, jclass clazz, jint size) {
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
index 0e28a77fee..eaa765cb34 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -61,7 +61,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env,
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
- *  Signature: (JZ)
+ *  Signature: (JZ)V
  */
 JNIEXPORT void JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
@@ -69,6 +69,16 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
                                                            jlong handle,
                                                            jboolean state);
 
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (JI)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jint num_threads);
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
index 63ea89df56..e0aa070e2d 100644
--- a/tensorflow/contrib/lite/kernels/add.cc
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -176,7 +176,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                   output);
   } else {
     context->ReportError(context,
-                         "Inputs and outputs not all float|unit8 types.");
+                         "Inputs and outputs not all float|uint8 types.");
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/div.cc b/tensorflow/contrib/lite/kernels/div.cc
index 6dd243ad62..ec380c8e49 100644
--- a/tensorflow/contrib/lite/kernels/div.cc
+++ b/tensorflow/contrib/lite/kernels/div.cc
@@ -106,6 +106,8 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
 #undef TF_LITE_DIV
 }
 
+
+
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteDivParams*>(node->builtin_data);
@@ -118,7 +120,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   if (output->type == kTfLiteFloat32) {
     EvalFloat<kernel_type>(context, node, params, data, input1, input2, output);
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    // Only the FLOAT32 branch exists above; do not advertise other types.
+    context->ReportError(context, "Div only supports FLOAT32 now.");
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index d585bcca0e..9e9aba0169 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -4374,7 +4374,7 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
-  gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
+  gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index ae295cc8b5..4c8cbe4275 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1403,6 +1403,33 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                output_data, output_dims);
 }
 
+inline void Div(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] /
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
@@ -1444,18 +1471,6 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-inline void Div(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] / input2_data[i], output_activation_min,
-        output_activation_max);
-  }
-}
-
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc
index 66b06aeaec..7c60a4fdbf 100644
--- a/tensorflow/contrib/lite/kernels/sub.cc
+++ b/tensorflow/contrib/lite/kernels/sub.cc
@@ -174,7 +174,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
                                output);
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context,
+                         "Inputs and outputs not all float|uint8 types.");
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
index 477e7f13da..38e0005890 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
@@ -32,7 +32,7 @@ bool ResolveTensorFlowMerge::Run(Model* model, std::size_t op_index) {
   }
 
   // We need to yield until this Merge node has only 1 input, which will mean
-  // that that is the selected input. Other graph transformations on other nodes
+  // that it is the selected input. Other graph transformations on other nodes
   // such as ResolveTensorFlowSwitch, will take care of trimming the
   // non-selected inputs, so that at some point there will be only 1 input left.
   if (merge_op->inputs.size() > 1) {
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 705a9d69a6..482cc71d8b 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -152,9 +152,9 @@ enum class AxesOrder {
 };
 
 // The type of the scalars in an array.
-// Note that that does not by itself tell whether the values in the array are
-// real (are literally interpreted as real numbers) or quantized (only acquire
-// a meaning as real numbers in conjunction with QuantizationParams).
+// Note that the type does not by itself tell whether the values in the array
+// are real (are literally interpreted as real numbers) or quantized (only
+// acquire a meaning as real numbers in conjunction with QuantizationParams).
 //
 // In practice though:
 //   float values are always real
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 8c3a8afe7a..bdad34a665 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
 
 __all__ = [
     "absolute_difference", "add_loss", "cosine_distance",
@@ -651,11 +652,9 @@ def cosine_distance(predictions,
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `weights` is `None`.
   """
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dim'")
-    axis = dim
-  if axis is None and dim is None:
+  axis = deprecated_argument_lookup(
+      "axis", axis, "dim", dim)
+  if axis is None:
     raise ValueError("You must specify 'axis'.")
   with ops.name_scope(scope, "cosine_distance_loss",
                       [predictions, labels, weights]) as scope:
diff --git a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
index 2b9eee4ef7..de76acb51f 100644
--- a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
+++ b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
@@ -711,7 +711,7 @@ def _find_loss_augmented_facility_idx(pairwise_distances, labels, chosen_ids,
       candidate_scores, margin_multiplier * nmi_scores)
 
   argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, dimension=0))
+      math_ops.argmax(candidate_scores, axis=0))
 
   return candidate_ids[argmax_index]
 
@@ -811,7 +811,7 @@ def update_medoid_per_cluster(pairwise_distances, pairwise_distances_subset,
   candidate_scores = math_ops.add(scores_fac, margin_multiplier * scores_margin)
 
   argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, dimension=0))
+      math_ops.argmax(candidate_scores, axis=0))
 
   best_medoid = math_ops.to_int32(cluster_member_ids[argmax_index])
   chosen_ids = update_1d_tensor(chosen_ids, cluster_idx, best_medoid)
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 48953e2e38..eff9081e35 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -27,7 +27,9 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
+# the archive has been propagated in mirror.bazel.build.
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
index 4090c1ff3e..f37a2593e2 100644
--- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
+++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
@@ -348,7 +348,7 @@ def _freeze_graph_with_def_protos(input_graph_def, output_node_names,
                                   input_saver_def, input_checkpoint):
   """Converts all variables in a graph and checkpoint into constants.
 
-  During this process, we need to retain certain initialzer nodes (e.g. table
+  During this process, we need to retain certain initializer nodes (e.g. table
   initializer nodes). Instead of determining which dependencies
   of the shared initializer node (e.g. group_deps) to keep, we
   reconstruct the connections between the individual initializer nodes and
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 5364e3075d..00a933e5e0 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -2834,7 +2834,9 @@ def streaming_sparse_average_precision_at_top_k(top_k_predictions,
       name=name)
 
 
-@deprecated(None, 'Please switch to tf.metrics.mean.')
+@deprecated(None,
+            'Please switch to tf.metrics.mean_absolute_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_absolute_error(predictions,
                                   labels,
                                   weights=None,
@@ -2953,7 +2955,9 @@ def streaming_mean_relative_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None,
+            'Please switch to tf.metrics.mean_squared_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_squared_error(predictions,
                                  labels,
                                  weights=None,
@@ -3011,7 +3015,10 @@ def streaming_mean_squared_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(
+    None,
+    'Please switch to tf.metrics.root_mean_squared_error. Note that the '
+    'order of the labels and predictions arguments has been switched.')
 def streaming_root_mean_squared_error(predictions,
                                       labels,
                                       weights=None,
@@ -3351,7 +3358,7 @@ def streaming_mean_cosine_distance(predictions,
   radial_diffs = math_ops.reduce_sum(
       radial_diffs, reduction_indices=[
           dim,
-      ], keep_dims=True)
+      ], keepdims=True)
   mean_distance, update_op = streaming_mean(radial_diffs, weights, None, None,
                                             name or 'mean_cosine_distance')
   mean_distance = math_ops.subtract(1.0, mean_distance)
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops.py b/tensorflow/contrib/nn/python/ops/sampling_ops.py
index 63fc487dca..e65925610c 100644
--- a/tensorflow/contrib/nn/python/ops/sampling_ops.py
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops.py
@@ -88,7 +88,7 @@ def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
     return math_ops.reduce_logsumexp(
         math_ops.matmul(embeddings, reweighted_inputs, transpose_b=True),
         axis=1,
-        keep_dims=False)
+        keepdims=False)
 
   # Calling this protected form of embedding_lookup allows co-locating
   # the logsumexp computation with the partitioned weights, which yields
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index c57c5e3f29..612ecc3e63 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -14,6 +14,7 @@ py_library(
     name = "opt_py",
     srcs = [
         "__init__.py",
+        "python/training/adamax.py",
         "python/training/addsign.py",
         "python/training/drop_stale_gradient_optimizer.py",
         "python/training/elastic_average_optimizer.py",
@@ -43,11 +44,27 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
 
+py_test(
+    name = "adamax_test",
+    srcs = ["python/training/adamax_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "external_optimizer_test",
     srcs = ["python/training/external_optimizer_test.py"],
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index 6c1bb1adc0..4c13c8e247 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.opt.python.training.adamax import *
 from tensorflow.contrib.opt.python.training.addsign import *
 from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
 from tensorflow.contrib.opt.python.training.external_optimizer import *
@@ -36,6 +37,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 
 _allowed_symbols = [
+    'AdaMaxOptimizer',
     'PowerSignOptimizer',
     'AddSignOptimizer',
     'DelayCompensatedGradientDescentOptimizer',
diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
new file mode 100644
index 0000000000..686bac0d84
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""AdaMax for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import adam
+from tensorflow.python.training import training_ops
+
+
+class AdaMaxOptimizer(adam.AdamOptimizer):
+  """Optimizer that implements the AdaMax algorithm.
+
+  AdaMax is sometimes superior to Adam, especially in models with embeddings,
+  see [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
+               use_locking=False, name="AdaMax"):
+    """Construct a new AdaMax optimizer.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    v_0 <- 0 (Initialize the exponentially weighted infinity norm)
+    t <- 0 (Initialize timestep)
+    ```
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section 7.1 of the paper:
+
+    ```
+    t <- t + 1
+
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    v_t <- max(beta2 * v_{t-1}, abs(g))
+    variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+    ```
+
+    Similar to AdamOptimizer, the epsilon is added for numerical stability
+    (especially to get rid of division by zero when v_t = 0).
+
+    In contrast to AdamOptimizer, the sparse implementation of this algorithm
+    (used when the gradient is an IndexedSlices object, typically because of
+    `tf.gather` or an embedding lookup in the forward pass) only updates
+    variable slices and corresponding `m_t`, `v_t` terms when that part of
+    the variable was used in the forward pass. This means that the sparse
+    behavior differs from the dense behavior (similar to some momentum
+    implementations which ignore momentum unless a variable slice was actually
+    used).
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta1: A float value or a constant float tensor.
+        The exponential decay rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor.
+        The exponential decay rate for the exponentially weighted infinity norm.
+      epsilon: A small constant for numerical stability.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "AdaMax".
+    """
+    super(AdaMaxOptimizer, self).__init__(learning_rate, beta1, beta2,
+                                          epsilon, use_locking, name)
+
+  def _get_beta_accumulators(self):
+    if context.executing_eagerly():
+      graph = None
+    else:
+      graph = ops.get_default_graph()
+    return self._get_non_slot_variable("beta1_power", graph=graph)
+
+  def _create_slots(self, var_list):
+    # Create the beta1 accumulators on the same device as the first
+    # variable. Sort the var_list to make sure this device is consistent across
+    # workers (these need to go on the same PS, otherwise some updates are
+    # silently ignored).
+    first_var = min(var_list, key=lambda x: x.name)
+    self._create_non_slot_variable(initial_value=self._beta1,
+                                   name="beta1_power",
+                                   colocate_with=first_var)
+
+    # Create slots for the first moment and the weighted infinity norm.
+    for v in var_list:
+      self._zeros_slot(v, "m", self._name)
+      self._zeros_slot(v, "v", self._name)
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power = self._get_beta_accumulators()
+    return training_ops.apply_ada_max(
+        var, m, v,
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
+        grad, use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power = self._get_beta_accumulators()
+    return training_ops.resource_apply_ada_max(
+        var.handle, m.handle, v.handle,
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
+        grad, use_locking=self._use_locking)
+
+  def _apply_sparse_shared(self, grad, var, indices,
+                           scatter_add, scatter_update):
+    beta1_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, "m")
+    m_slice = array_ops.gather(m, indices)
+    m_t_slice = m_slice * beta1_t + grad * (1 - beta1_t)
+    with ops.control_dependencies([m_t_slice]):
+      m_t = scatter_update(m, indices, m_t_slice)
+    # v_t = max(beta2 * v, abs(g_t))
+    v = self.get_slot(var, "v")
+    v_slice = array_ops.gather(v, indices)
+    v_t_slice = math_ops.maximum(v_slice * beta2_t, math_ops.abs(grad))
+    with ops.control_dependencies([v_t_slice]):
+      v_t = scatter_update(v, indices, v_t_slice)
+    # theta_t = theta - lr / (1 - beta1^t) * m_t / (v_t + epsilon)
+    var_slice = -lr_t / (1 - beta1_power) * (m_t_slice /
+                                             (v_t_slice + epsilon_t))
+    with ops.control_dependencies([var_slice]):
+      var_update = scatter_add(var, indices, var_slice)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _apply_sparse(self, grad, var):
+    return self._apply_sparse_shared(
+        grad.values, var, grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking),
+        lambda x, i, v: state_ops.scatter_update(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking))
+
+  def _resource_scatter_update(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_update(
+            x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    return self._apply_sparse_shared(
+        grad, var, indices,
+        self._resource_scatter_add, self._resource_scatter_update)
+
+  def _finish(self, update_ops, name_scope):
+    # Update the beta1 power accumulator.
+    with ops.control_dependencies(update_ops):
+      beta1_power = self._get_beta_accumulators()
+      with ops.colocate_with(beta1_power):
+        update_beta1 = beta1_power.assign(
+            beta1_power * self._beta1_t, use_locking=self._use_locking)
+    return control_flow_ops.group(*update_ops + [update_beta1],
+                                  name=name_scope)
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
new file mode 100644
index 0000000000..bc92a7006f
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -0,0 +1,348 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for AdaMax."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import adamax
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adamax_update_numpy(param,
+                        g_t,
+                        t,
+                        m,
+                        v,
+                        alpha=0.001,
+                        beta1=0.9,
+                        beta2=0.999,
+                        epsilon=1e-8):
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = np.maximum(beta2 * v, np.abs(g_t))
+  param_t = param - (alpha / (1 - beta1**t)) * (m_t / (v_t + epsilon))
+  return param_t, m_t, v_t
+
+
+def adamax_sparse_update_numpy(param,
+                               indices,
+                               g_t,
+                               t,
+                               m,
+                               v,
+                               alpha=0.001,
+                               beta1=0.9,
+                               beta2=0.999,
+                               epsilon=1e-8):
+  m_t, v_t, param_t = np.copy(m), np.copy(v), np.copy(param)
+  m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t
+  v_t_slice = np.maximum(beta2 * v[indices], np.abs(g_t))
+  param_t_slice = param[indices] - ((alpha / (1 - beta1**t)) *
+                                    (m_t_slice / (v_t_slice + epsilon)))
+  m_t[indices] = m_t_slice
+  v_t[indices] = v_t_slice
+  param_t[indices] = param_t_slice
+  return param_t, m_t, v_t
+
+
+class AdaMaxOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype)
+        m0, v0, m1, v1 = zero_slots(), zero_slots(), zero_slots(), zero_slots()
+        var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([2, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = adamax.AdaMaxOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0, 3.0], var0.eval())
+        self.assertAllClose([4.0, 5.0, 6.0], var1.eval())
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of AdaMax
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_sparse_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_sparse_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adamax.AdaMaxOptimizer(3.0)
+        minimize_op = optimizer.minimize(gathered_sum)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adamax.AdaMaxOptimizer().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adamax.AdaMaxOptimizer().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def doTestBasic(self, use_resource=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.test_session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adamax.AdaMaxOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        opt_variables = opt.variables()
+        beta1_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertIn(beta1_power, opt_variables)
+
+        with ops.Graph().as_default():
+          # Shouldn't return non-slot variables from other graphs.
+          self.assertEqual(0, len(opt.variables()))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of AdaMax
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          elif t > 1:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta1_power))
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/AdaMax:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.test_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.AdaMaxOptimizer(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of AdaMax
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.AdaMaxOptimizer()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined AdaMax1 and AdaMax2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTwoSessions(self):
+    optimizer = adamax.AdaMaxOptimizer()
+    g = ops.Graph()
+    with g.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adamax.AdaMaxOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be one non-slot variable (beta1_power) and two unique slot
+      # variables (m and v) for each of v1 and v2: 1 + 2 * 2 = 5 in total.
+      self.assertEqual(5, len(set(opt.variables())))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
index 85e3e8d379..ac04ad9911 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
@@ -85,7 +85,7 @@ class MovingAverageOptimizerTest(test.TestCase):
               state_ops.assign_add(ema_var1, [4.0, 4.0])
           ])
 
-          # Test taht saver with missing ema variables will fail.
+          # Test that saver with missing ema variables will fail.
           with self.assertRaisesRegexp(ValueError, r'Variable to swap'):
             opt.swapping_saver(var_list=[var0])
 
@@ -123,7 +123,7 @@ class MovingAverageOptimizerTest(test.TestCase):
             self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
             self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
             self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
-            # Restore back to previou state.
+            # Restore back to previous state.
             train_saver.restore(sess, save_path)
 
           # If updates are parallel, this is not always true after the 1st step.
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 6ade4ccd52..8ac9b58145 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -456,7 +456,7 @@ class CheckpointingTests(test.TestCase):
         optimizer.apply_gradients(
             [(g, v) for g, v in zip(grad, model.vars)])
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index dcb5bb6416..46bfbb729f 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -564,7 +564,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   ### State
 
-  Internal methods apre passed a `state` argument with the correct
+  Internal methods are passed a `state` argument with the correct
   values to use for the slot and non-slot variables, and the hyper
   parameters.
   """
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 4a8f8a04cc..aa0ef64308 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -545,7 +545,7 @@ def _GetBatchNormParams(graph, context, has_scaling):
         gamma_tensor = graph.get_tensor_by_name(op.name + ':0')
 
   if not has_scaling:
-    gamma_tensor = array_ops.ones(batch_mean_tensor.shape)
+    gamma_tensor = array_ops.ones(moving_mean_tensor.shape)
 
   return _BatchNormMatch(
       layer_op=None,
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 0232103c41..cd162bae25 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.contrib.seq2seq.python.ops import basic_decoder
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -110,7 +111,12 @@ class AttentionWrapperTest(test.TestCase):
                          alignment_history=False,
                          expected_final_alignment_history=None,
                          attention_layer_size=6,
+                         attention_layer=None,
                          name=''):
+    attention_layer_sizes = (
+        [attention_layer_size] if attention_layer_size is not None else None)
+    attention_layers = (
+        [attention_layer] if attention_layer is not None else None)
     self._testWithMaybeMultiAttention(
         is_multi=False,
         create_attention_mechanisms=[create_attention_mechanism],
@@ -119,7 +125,8 @@ class AttentionWrapperTest(test.TestCase):
         attention_mechanism_depths=[attention_mechanism_depth],
         alignment_history=alignment_history,
         expected_final_alignment_history=expected_final_alignment_history,
-        attention_layer_sizes=[attention_layer_size],
+        attention_layer_sizes=attention_layer_sizes,
+        attention_layers=attention_layers,
         name=name)
 
   def _testWithMaybeMultiAttention(self,
@@ -131,6 +138,7 @@ class AttentionWrapperTest(test.TestCase):
                                    alignment_history=False,
                                    expected_final_alignment_history=None,
                                    attention_layer_sizes=None,
+                                   attention_layers=None,
                                    name=''):
     # Allow is_multi to be True with a single mechanism to enable test for
     # passing in a single mechanism in a list.
@@ -144,12 +152,18 @@ class AttentionWrapperTest(test.TestCase):
     encoder_output_depth = 10
     cell_depth = 9
 
-    if attention_layer_sizes is None:
-      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
-    else:
+    if attention_layer_sizes is not None:
       # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
       attention_depth = sum([attention_layer_size or encoder_output_depth
                              for attention_layer_size in attention_layer_sizes])
+    elif attention_layers is not None:
+      # Compute sum of attention_layers output depth.
+      attention_depth = sum(
+          attention_layer.compute_output_shape(
+              [batch_size, cell_depth + encoder_output_depth])[-1].value
+          for attention_layer in attention_layers)
+    else:
+      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
 
     decoder_inputs = array_ops.placeholder_with_default(
         np.random.randn(batch_size, decoder_max_time,
@@ -171,13 +185,20 @@ class AttentionWrapperTest(test.TestCase):
       with vs.variable_scope(
           'root',
           initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
+        attention_layer_size = attention_layer_sizes
+        attention_layer = attention_layers
+        if not is_multi:
+          if attention_layer_size is not None:
+            attention_layer_size = attention_layer_size[0]
+          if attention_layer is not None:
+            attention_layer = attention_layer[0]
         cell = rnn_cell.LSTMCell(cell_depth)
         cell = wrapper.AttentionWrapper(
             cell,
             attention_mechanisms if is_multi else attention_mechanisms[0],
-            attention_layer_size=(attention_layer_sizes if is_multi
-                                  else attention_layer_sizes[0]),
-            alignment_history=alignment_history)
+            attention_layer_size=attention_layer_size,
+            alignment_history=alignment_history,
+            attention_layer=attention_layer)
         helper = helper_py.TrainingHelper(decoder_inputs,
                                           decoder_sequence_length)
         my_decoder = basic_decoder.BasicDecoder(
@@ -260,6 +281,41 @@ class AttentionWrapperTest(test.TestCase):
             expected_final_alignment_history,
             final_alignment_history_info)
 
+  def testBahdanauNormalizedDType(self):
+    for dtype in [np.float16, np.float32, np.float64]:
+      num_units = 128
+      encoder_outputs = array_ops.placeholder(dtype, shape=[64, None, 256])
+      encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      decoder_inputs = array_ops.placeholder(dtype, shape=[64, None, 128])
+      decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      batch_size = 64
+      attention_mechanism = wrapper.BahdanauAttention(
+          num_units=num_units,
+          memory=encoder_outputs,
+          memory_sequence_length=encoder_sequence_length,
+          normalize=True,
+          dtype=dtype,
+      )
+      cell = rnn_cell.LSTMCell(num_units)
+      cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+      helper = helper_py.TrainingHelper(decoder_inputs,
+                                        decoder_sequence_length)
+      my_decoder = basic_decoder.BasicDecoder(
+          cell=cell,
+          helper=helper,
+          initial_state=cell.zero_state(
+              dtype=dtype, batch_size=batch_size))
+
+      final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
+      self.assertTrue(
+          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+      self.assertTrue(
+          isinstance(final_state, wrapper.AttentionWrapperState))
+      self.assertTrue(
+          isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))
+
   def testBahdanauNotNormalized(self):
     create_attention_mechanism = wrapper.BahdanauAttention
 
@@ -797,6 +853,48 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_alignment_history=expected_final_alignment_history,
         name='testMultiAttention')
 
+  def testMultiAttentionWithLayerInstances(self):
+    create_attention_mechanisms = (
+        wrapper.BahdanauAttention, wrapper.LuongAttention)
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 7), dtype=dtype('float32'), mean=0.0011709079),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=3.2000000000000002))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0038725811),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019329828)),
+        attention=ResultSummary(
+            shape=(5, 7), dtype=dtype('float32'), mean=0.001174294),
+        time=3,
+        alignments=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        attention_state=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        alignment_history=())
+
+    expected_final_alignment_history = (
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125))
+
+    self._testWithMaybeMultiAttention(
+        True,
+        create_attention_mechanisms,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depths=[9, 9],
+        attention_layers=[layers_core.Dense(3, use_bias=False),
+                          layers_core.Dense(4, use_bias=False)],
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testMultiAttention')
+
   def testLuongMonotonicHard(self):
     # Run attention mechanism with mode='hard', make sure probabilities are hard
     b, t, u, d = 10, 20, 30, 40
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 8a40a7ab53..1c9d179e3c 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -472,7 +472,8 @@ def _bahdanau_score(processed_query, keys, normalize):
     # Scalar used in weight normalization
     g = variable_scope.get_variable(
         "attention_g", dtype=dtype,
-        initializer=math.sqrt((1. / num_units)))
+        initializer=init_ops.constant_initializer(math.sqrt((1. / num_units))),
+        shape=())
     # Bias added prior to the nonlinearity
     b = variable_scope.get_variable(
         "attention_b", [num_units], dtype=dtype,
@@ -1082,7 +1083,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
                cell_input_fn=None,
                output_attention=True,
                initial_cell_state=None,
-               name=None):
+               name=None,
+               attention_layer=None):
     """Construct the `AttentionWrapper`.
 
     **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
@@ -1125,7 +1127,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         (default), use the context as attention at each time step. Otherwise,
         feed the context and cell output into the attention layer to generate
         attention at each time step. If attention_mechanism is a list,
-        attention_layer_size must be a list of the same length.
+        attention_layer_size must be a list of the same length. If
+        attention_layer is set, this must be None.
       alignment_history: Python boolean, whether to store alignment history
         from all time steps in the final output state (currently stored as a
         time major `TensorArray` on which you must call `stack()`).
@@ -1145,12 +1148,19 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         does not match the batch size of `initial_cell_state`, proper
         behavior is not guaranteed.
       name: Name to use when creating ops.
+      attention_layer: A list of `tf.layers.Layer` instances or a
+        single `tf.layers.Layer` instance taking the context and cell output as
+        inputs to generate attention at each time step. If None (default), use
+        the context as attention at each time step. If attention_mechanism is a
+        list, attention_layer must be a list of the same length. If
+        attention_layer_size is set, this must be None.
 
     Raises:
       TypeError: `attention_layer_size` is not None and (`attention_mechanism`
         is a list but `attention_layer_size` is not; or vice versa).
       ValueError: if `attention_layer_size` is not None, `attention_mechanism`
-        is a list, and its length does not match that of `attention_layer_size`.
+        is a list, and its length does not match that of `attention_layer_size`;
+        if `attention_layer_size` and `attention_layer` are set simultaneously.
     """
     super(AttentionWrapper, self).__init__(name=name)
     rnn_cell_impl.assert_like_rnncell("cell", cell)
@@ -1181,6 +1191,10 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
             "cell_input_fn must be callable, saw type: %s"
             % type(cell_input_fn).__name__)
 
+    if attention_layer_size is not None and attention_layer is not None:
+      raise ValueError("Only one of attention_layer_size and attention_layer "
+                       "should be set")
+
     if attention_layer_size is not None:
       attention_layer_sizes = tuple(
           attention_layer_size
@@ -1199,6 +1213,22 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
               dtype=attention_mechanisms[i].dtype)
           for i, attention_layer_size in enumerate(attention_layer_sizes))
       self._attention_layer_size = sum(attention_layer_sizes)
+    elif attention_layer is not None:
+      self._attention_layers = tuple(
+          attention_layer
+          if isinstance(attention_layer, (list, tuple))
+          else (attention_layer,))
+      if len(self._attention_layers) != len(attention_mechanisms):
+        raise ValueError(
+            "If provided, attention_layer must contain exactly one "
+            "layer per attention_mechanism, saw: %d vs %d"
+            % (len(self._attention_layers), len(attention_mechanisms)))
+      self._attention_layer_size = sum(
+          layer.compute_output_shape(
+              [None,
+               cell.output_size + mechanism.values.shape[-1].value])[-1].value
+          for layer, mechanism in zip(
+              self._attention_layers, attention_mechanisms))
     else:
       self._attention_layers = None
       self._attention_layer_size = sum(
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
index 35c4b5bec1..345eb6cfaa 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.signal.python.kernel_tests import test_util
 from tensorflow.contrib.signal.python.ops import mel_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 # mel spectrum constants and functions.
@@ -173,6 +174,18 @@ class LinearToMelTest(test.TestCase):
         rewritten_graph = test_util.grappler_optimize(g, [mel_matrix])
         self.assertEqual(1, len(rewritten_graph.node))
 
+  def test_num_spectrogram_bins_dynamic(self):
+    with self.test_session(use_gpu=True):
+      num_spectrogram_bins = array_ops.placeholder(shape=(),
+                                                   dtype=dtypes.int32)
+      mel_matrix_np = spectrogram_to_mel_matrix(
+          20, 129, 8000.0, 125.0, 3800.0)
+      mel_matrix = mel_ops.linear_to_mel_weight_matrix(
+          20, num_spectrogram_bins, 8000.0, 125.0, 3800.0)
+      self.assertAllClose(
+          mel_matrix_np,
+          mel_matrix.eval(feed_dict={num_spectrogram_bins: 129}), atol=3e-6)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/contrib/signal/python/ops/mel_ops.py
index d1a36548d9..1e84006116 100644
--- a/tensorflow/contrib/signal/python/ops/mel_ops.py
+++ b/tensorflow/contrib/signal/python/ops/mel_ops.py
@@ -64,14 +64,11 @@ def _hertz_to_mel(frequencies_hertz, name=None):
         1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
 
 
-def _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
+def _validate_arguments(num_mel_bins, sample_rate,
                         lower_edge_hertz, upper_edge_hertz, dtype):
   """Checks the inputs to linear_to_mel_weight_matrix."""
   if num_mel_bins <= 0:
     raise ValueError('num_mel_bins must be positive. Got: %s' % num_mel_bins)
-  if num_spectrogram_bins <= 0:
-    raise ValueError('num_spectrogram_bins must be positive. Got: %s' %
-                     num_spectrogram_bins)
   if sample_rate <= 0.0:
     raise ValueError('sample_rate must be positive. Got: %s' % sample_rate)
   if lower_edge_hertz < 0.0:
@@ -122,9 +119,9 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
 
   Args:
     num_mel_bins: Python int. How many bands in the resulting mel spectrum.
-    num_spectrogram_bins: Python int. How many bins there are in the source
-      spectrogram data, which is understood to be `fft_size // 2 + 1`, i.e. the
-      spectrogram only contains the nonredundant FFT bins.
+    num_spectrogram_bins: An integer `Tensor`. How many bins there are in the
+      source spectrogram data, which is understood to be `fft_size // 2 + 1`,
+      i.e. the spectrogram only contains the nonredundant FFT bins.
     sample_rate: Python float. Samples per second of the input signal used to
       create the spectrogram. We need this to figure out the actual frequencies
       for each spectrogram bin, which dictates how they are mapped into the mel
@@ -148,7 +145,10 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
   [mel]: https://en.wikipedia.org/wiki/Mel_scale
   """
   with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name:
-    _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
+    # Note: As num_spectrogram_bins is passed to `math_ops.linspace`
+    # and the validation is already done in linspace (both in shape function
+    # and in kernel), there is no need to validate num_spectrogram_bins here.
+    _validate_arguments(num_mel_bins, sample_rate,
                         lower_edge_hertz, upper_edge_hertz, dtype)
 
     # To preserve accuracy, we compute the matrix at float64 precision and then
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 40f484fd78..746b955642 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -290,9 +290,9 @@ slim.stack(x, slim.conv2d, [(32, [3, 3]), (32, [1, 1]), (64, [3, 3]), (64, [1, 1
 
 In addition to the types of scope mechanisms in TensorFlow
 ([name_scope](https://www.tensorflow.org/api_docs/python/tf/name_scope),
-[variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope),
+[variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope)),
 TF-Slim adds a new scoping mechanism called
-[arg_scope](https://www.tensorflow.org/api_docs/python/tf/contrib/framework/arg_scope),
+[arg_scope](https://www.tensorflow.org/api_docs/python/tf/contrib/framework/arg_scope).
 This new scope allows a user to specify one or more operations and a set of
 arguments which will be passed to each of the operations defined in the
 `arg_scope`. This functionality is best illustrated by example. Consider the
@@ -761,8 +761,8 @@ parts:
 3. Finalization: (optionally) perform any final operation to compute metric
 values. For example, computing means, mins, maxes, etc.
 
-For example, to compute `mean_absolute_error`, two variables, a `count` and
-`total` variable are *initialized* to zero. During *aggregation*, we observed
+For example, to compute `mean_absolute_error`, two variables (`count` and
+`total`) are *initialized* to zero. During *aggregation*, we observe
 some set of predictions and labels, compute their absolute differences and add
 the total to `total`. Each time we observe another value,
 `count` is incremented. Finally, during *finalization*, `total` is divided
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 6a200de1ea..8a2c74742a 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -389,7 +389,7 @@ def create_train_op(total_loss,
     total_loss: A `Tensor` representing the total loss.
     optimizer: A tf.Optimizer to use for computing the gradients.
     global_step: A `Tensor` representing the global step variable. If left as
-      `_USE_GLOBAL_STEP`, then slim.variables.global_step() is used.
+      `_USE_GLOBAL_STEP`, then tf.contrib.framework.global_step() is used.
     update_ops: An optional list of updates to execute. If `update_ops` is
       `None`, then the update ops are set to the contents of the
       `tf.GraphKeys.UPDATE_OPS` collection. If `update_ops` is not `None`, but
@@ -578,7 +578,8 @@ def train(train_op,
     is_chief: Specifies whether or not the training is being run by the primary
       replica during replica training.
     global_step: The `Tensor` representing the global step. If left as `None`,
-      then slim.variables.get_or_create_global_step() is used.
+      then training_util.get_or_create_global_step(), that is,
+      tf.contrib.framework.get_or_create_global_step() is used.
     number_of_steps: The max number of gradient steps to take during training,
       as measured by 'global_step': training will stop if global_step is
       greater than 'number_of_steps'. If the value is left as None, training
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
index 235a595de4..11c4214176 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
@@ -207,7 +207,7 @@ def resnet_v1(inputs,
         net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
         if global_pool:
           # Global average pooling.
-          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
+          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
         if num_classes is not None:
           net = layers.conv2d(
               net,
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
index 61665c9c8b..19e0538dd1 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
@@ -221,7 +221,7 @@ def resnet_v2(inputs,
             net, activation_fn=nn_ops.relu, scope='postnorm')
         if global_pool:
           # Global average pooling.
-          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
+          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
         if num_classes is not None:
           net = layers_lib.conv2d(
               net,
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index 4abcc20ed3..35e8c92aba 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -399,7 +399,7 @@ def get_combined_model_fn(model_fns):
   training ops: tf.group them.
   loss: average them.
   predictions: concat probabilities such that predictions[*][0-C1] are the
-    probablities for output 1 (where C1 is the number of classes in output 1),
+    probabilities for output 1 (where C1 is the number of classes in output 1),
     predictions[*][C1-(C1+C2)] are the probabilities for output 2 (where C2
     is the number of classes in output 2), etc.  Also stack predictions such
     that predictions[i][j] is the class prediction for example i and output j.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
index cf0db788a4..06bfe871fd 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
@@ -80,7 +80,7 @@ REGISTER_OP("HardRoutingFunction")
    regression model that translates from node features to
    probabilities.
 
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
index c9df09bfda..1a055756c0 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
@@ -85,7 +85,7 @@ REGISTER_OP("StochasticHardRoutingFunction")
    regression model that translates from node features to
    probabilities.
 
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
index b0d8b832b5..7d092bbc24 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
@@ -81,7 +81,7 @@ REGISTER_OP("StochasticHardRoutingGradient")
   tree_biases: `tree_biases[i]` gives the bias of the logistic
    regression model that translates from node features to
    probabilities.
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
index 44997ec5d6..cefcc96051 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
@@ -421,7 +421,7 @@ double getChebyshevEpsilon(const std::vector<float>& mu1,
                            const std::vector<float>& mu2) {
   // Math time!!
   // We are trying to minimize d = |mu1 - x|^2 + |mu2 - y|^2 over the surface.
-  // Using Langrange multipliers, we get
+  // Using Lagrange multipliers, we get
   //   partial d / partial x = -2 mu1 + 2 x = lambda_1 1 + 2 lambda_3 x
   //   partial d / partial y = -2 mu2 + 2 y = lambda_2 1 - 2 lambda_3 y
   // or
@@ -485,7 +485,7 @@ double getChebyshevEpsilon(const std::vector<float>& mu1,
   }
 
   double sdiscrim = sqrt(discrim);
-  // TODO(thomaswc): Analyze whetever one of these is always closer.
+  // TODO(thomaswc): Analyze whether one of these is always closer.
   double v1 = (-b + sdiscrim) / (2 * a);
   double v2 = (-b - sdiscrim) / (2 * a);
   double dist1 = getDistanceFromLambda3(v1, mu1, mu2);
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index edbac67006..03aab1b61e 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -123,7 +123,7 @@ bool BestSplitDominatesRegression(const Tensor& total_sums,
                                   const Tensor& split_squares,
                                   int32 accumulator);
 
-// Performs booststrap_samples bootstrap samples of the best split's class
+// Performs bootstrap_samples bootstrap samples of the best split's class
 // counts and the second best splits's class counts, and returns true if at
 // least dominate_fraction of the time, the former has a better (lower)
 // Gini impurity.  Does not take over ownership of *rand.
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
index 328af28725..d3edb43733 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -60,7 +60,7 @@ class DecisionTreeResource : public ResourceBase {
   mutex* get_mutex() { return &mu_; }
 
   // Return the TreeNode for the leaf that the example ends up at according
-  // to decsion_tree_. Also fill in that leaf's depth if it isn't nullptr.
+  // to decision_tree_. Also fill in that leaf's depth if it isn't nullptr.
   int32 TraverseTree(const std::unique_ptr<TensorDataSet>& input_data,
                      int example, int32* depth, TreePath* path) const;
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
index bf2b2aaa3c..3db351c328 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
@@ -60,7 +60,7 @@ class InequalityDecisionNodeEvaluator : public BinaryDecisionNodeEvaluator {
   bool include_equals_;
 };
 
-// Evalutor for splits with multiple weighted features.
+// Evaluator for splits with multiple weighted features.
 class ObliqueInequalityDecisionNodeEvaluator
     : public BinaryDecisionNodeEvaluator {
  public:
diff --git a/tensorflow/contrib/tensor_forest/ops/model_ops.cc b/tensorflow/contrib/tensor_forest/ops/model_ops.cc
index 3099cccdf8..98124d519c 100644
--- a/tensorflow/contrib/tensor_forest/ops/model_ops.cc
+++ b/tensorflow/contrib/tensor_forest/ops/model_ops.cc
@@ -165,7 +165,7 @@ tree_handle: The handle to the tree.
 leaf_ids: `leaf_ids[i]` is the leaf id for input i.
 input_labels: The training batch's labels as a 1 or 2-d tensor.
   'input_labels[i][j]' gives the j-th label/target for the i-th input.
-input_weights: The training batch's eample weights as a 1-d tensor.
+input_weights: The training batch's example weights as a 1-d tensor.
   'input_weights[i]' gives the weight for the i-th input.
 )doc");
 
diff --git a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
index e8b5c5d8a6..5be581aaec 100644
--- a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
+++ b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
@@ -75,7 +75,7 @@ REGISTER_OP("GrowTreeV4")
     .Attr("params: string")
     .Input("tree_handle: resource")
     .Input("stats_handle: resource")
-    .Input("finshed_nodes: int32")
+    .Input("finished_nodes: int32")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
     .Doc(R"doc(
 Grows the tree for finished nodes and allocates waiting nodes.
@@ -83,7 +83,7 @@ Grows the tree for finished nodes and allocates waiting nodes.
 params: A serialized TensorForestParams proto.
 tree_handle: The handle to the tree.
 stats_handle: The handle to the stats.
-finshed_nodes: A 1-d Tensor of finished node ids from ProcessInput.
+finished_nodes: A 1-d Tensor of finished node ids from ProcessInput.
 )doc");
 
 REGISTER_OP("ProcessInputV4")
@@ -119,7 +119,7 @@ sparse_input_values: The values tensor from the SparseTensor input.
 sparse_input_shape: The shape tensor from the SparseTensor input.
 input_labels: The training batch's labels as a 1 or 2-d tensor.
   'input_labels[i][j]' gives the j-th label/target for the i-th input.
-input_weights: The training batch's eample weights as a 1-d tensor.
+input_weights: The training batch's example weights as a 1-d tensor.
   'input_weights[i]' gives the weight for the i-th input.
 finished_nodes: A 1-d tensor of node ids that have finished and are ready to
   grow.
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
index 3650b5d52f..b9bcbb170b 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
@@ -212,7 +212,7 @@ class ForestHParams(object):
     self.regression = getattr(self, 'regression', False)
 
     # Num_outputs is the actual number of outputs (a single prediction for
-    # classification, a N-dimenensional point for regression).
+    # classification, a N-dimensional point for regression).
     self.num_outputs = self.num_classes if self.regression else 1
 
     # Add an extra column to classes for storing counts, which is needed for
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 2f316767b3..f80b4f1b11 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -11,6 +11,7 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "py_test",
     "tf_cc_test",
     "tf_copts",
     "tf_cuda_library",
@@ -52,7 +53,6 @@ tf_custom_op_library(
         "ops/trt_engine_op.cc",
     ],
     deps = [
-        ":trt_engine_op_kernel",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
     ] + if_tensorrt([
@@ -140,6 +140,7 @@ tf_custom_op_py_library(
     ]),
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:resources",
     ],
@@ -174,6 +175,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":wrap_conversion",
+        "//tensorflow/python:tf_optimizer",
     ],
 )
 
@@ -183,6 +185,7 @@ tf_py_wrap_cc(
     copts = tf_copts(),
     deps = [
         ":trt_conversion",
+        ":trt_engine_op_kernel",
         "//tensorflow/core:framework_lite",
         "//util/python:python_headers",
     ],
@@ -272,3 +275,19 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
+
+py_test(
+    name = "tf_trt_integration_test",
+    srcs = ["test/tf_trt_integration_test.py"],
+    main = "test/tf_trt_integration_test.py",
+    srcs_version = "PY2AND3",
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        ":init_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index 6eafc1754c..687dee07e1 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -1,59 +1,29 @@
 # Using TensorRT in TensorFlow
 
-
-This module provides necessary bindings and introduces TRT_engine_op
-operator that wraps a subgraph in TensorRT. This is still a work in progress
-but should be useable with most common graphs.
+This module provides the necessary bindings and introduces the TRT_engine_op
+operator, which wraps a subgraph in TensorRT. This is still a work in progress
+but should be usable with most common graphs.
 
 ## Compilation
 
-
-In order to compile the module, you need to have a local TensorRT
-installation ( libnvinfer.so and respective include files ). During the
-configuration step, TensorRT should be enabled and installation path
-should be set. If installed through package managers (deb,rpm),
-configure script should find the necessary components from the system
-automatically. If installed from tar packages, user has to set path to
-location where the library is installed during configuration.
+In order to compile the module, you need to have a local TensorRT installation
+(libnvinfer.so and the respective include files). During the configuration
+step, TensorRT should be enabled and its installation path should be set. If
+TensorRT was installed through a package manager (deb, rpm), the configure
+script should find the necessary components automatically. If it was installed
+from a tar package, you have to set its install location during configuration.
 
 ```shell
 bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
 bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
 ```
 
-After the installation of tensorflow package, TensorRT transformation
-will be available. An example use can be found in test/test_tftrt.py script
+After the installation of the TensorFlow package, the TensorRT transformation
+will be available. An example can be found in the test/test_tftrt.py script.
 
 ## Installing TensorRT 3.0.4
 
-In order to make use of TensorRT integration, you will need a local installation of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt). Due to compiler compatibility, you will need to download and install the TensorRT 3.0.4 tarball for _Ubuntu 14.04_, i.e., **_TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz_**, even if you are using Ubuntu 16.04 or later.
-
-### Preparing TensorRT installation
-
-Once you have downloaded TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz, you will need to unpack it to an installation directory, which will be referred to as <install_dir>. Please replace <install_dir> with the full path of actual installation directory you choose in commands below.
-
-```shell
-cd <install_dir> && tar -zxf /path/to/TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz
-```
-
-After unpacking the binaries, you have several options to use them:
-
-#### To run TensorFlow as a user without superuser privileges
-
-For a regular user without any sudo rights, you should add TensorRT to your `$LD_LIBRARY_PATH`:
-
-  ```shell
-   export LD_LIBRARY_PATH=<install_dir>/TensorRT-3.0.4/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
-  ```
-
-Then you are ready to use TensorFlow-TensorRT integration. `$LD_LIBRARY_PATH` must contain the path to TensorRT installation for TensorFlow-TensorRT integration to work. If you are using a VirtualEnv-like setup, you can add the command above to your `bin/activate` script or to your `.bashrc` script.
-
-#### To run TensorFlow as a superuser
-
- When running as a superuser, such as in a container or via sudo, the `$LD_LIBRARY_PATH` approach above may not work. The following is preferred when the user has superuser privileges:
-
-  ```shell
-  echo "<install_dir>/TensorRT-3.0.4/lib" | sudo tee /etc/ld.so.conf.d/tensorrt304.conf && sudo ldconfig
-  ```
-
-  Please ensure that any existing deb package installation of TensorRT is removed before following these instructions to avoid package conflicts.
\ No newline at end of file
+In order to make use of TensorRT integration, you will need a local installation
+of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt).
+Installation instructions for compatibility with TensorFlow are provided on the
+[TensorFlow Installation page](https://www.tensorflow.org/install/install_linux#nvidia_requirements_to_run_tensorflow_with_gpu_support).
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
index e663eed4dd..9c3698e5d1 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
@@ -19,6 +19,12 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 
+std::shared_ptr<TRTResourceManager>
+tensorflow::tensorrt::TRTResourceManager::instance() {
+  static std::shared_ptr<TRTResourceManager> instance_(new TRTResourceManager);
+  return instance_;
+}
+
 std::shared_ptr<tensorflow::ResourceMgr>
 tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) {
   // mutex is held for lookup only. Most instantiations where mutex will be held
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
index 5f8ad491d3..bc15b51e05 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
@@ -29,11 +29,7 @@ class TRTResourceManager {
   TRTResourceManager() = default;
 
  public:
-  static std::shared_ptr<TRTResourceManager> instance() {
-    static std::shared_ptr<TRTResourceManager> instance_(
-        new TRTResourceManager);
-    return instance_;
-  }
+  static std::shared_ptr<TRTResourceManager> instance();
   // returns a manager for given op, if it doesn't exists it creates one
   std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
 
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
new file mode 100644
index 0000000000..7a47328762
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
@@ -0,0 +1,156 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+import numpy as np
+
+from tensorflow.contrib import tensorrt as trt
+from tensorflow.core.protobuf import config_pb2 as cpb2
+from tensorflow.python.framework import constant_op as cop
+from tensorflow.python.framework import dtypes as dtypes
+from tensorflow.python.framework import importer as importer
+from tensorflow.python.framework import ops as ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops as aops
+from tensorflow.python.ops import nn as nn
+from tensorflow.python.ops import nn_ops as nn_ops
+from tensorflow.python.platform import googletest
+
+
+@test_util.with_c_api
+class IntegrationTest(test_util.TensorFlowTestCase):
+  """Class to test Tensorflow-TensorRT integration."""
+
+  def setUp(self):
+    """Setup method."""
+    super(IntegrationTest, self).setUp()
+    warnings.simplefilter("always")
+    inp_dims = (100, 24, 24, 2)
+    self._input = np.random.random_sample(inp_dims)
+    self._original_graph = self.get_simple_graph_def()
+    self._gpu_options = cpb2.GPUOptions(
+        per_process_gpu_memory_fraction=0.50)
+    self._config = cpb2.ConfigProto(gpu_options=self._gpu_options)
+    self._reference = self.run_graph(self._original_graph, self._input)
+
+  def get_simple_graph_def(self):
+    """Create a simple graph and return its graph_def."""
+    g = ops.Graph()
+    with g.as_default():
+      a = aops.placeholder(
+          dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
+      e = cop.constant(
+          [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
+          name="weights",
+          dtype=dtypes.float32)
+      conv = nn.conv2d(
+          input=a,
+          filter=e,
+          strides=[1, 2, 2, 1],
+          padding="SAME",
+          name="conv")
+      b = cop.constant(
+          [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
+      t = nn.bias_add(conv, b, name="biasAdd")
+      relu = nn.relu(t, "relu")
+      idty = aops.identity(relu, "ID")
+      v = nn_ops.max_pool(
+          idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+      aops.squeeze(v, name="output")
+    return g.as_graph_def()
+
+  def run_graph(self, gdef, dumm_inp):
+    """Run given graphdef once."""
+    ops.reset_default_graph()
+    g = ops.Graph()
+    with g.as_default():
+      inp, out = importer.import_graph_def(
+          graph_def=gdef, return_elements=["input", "output"])
+      inp = inp.outputs[0]
+      out = out.outputs[0]
+    with self.test_session(
+        graph=g, config=self._config, use_gpu=True,
+        force_gpu=True) as sess:
+      val = sess.run(out, {inp: dumm_inp})
+    return val
+
+  # Use real data that is representative of the inference dataset
+  # for calibration. For this test script it is random data.
+  def run_calibration(self, gdef, dumm_inp):
+    """Run given calibration graph multiple times."""
+    ops.reset_default_graph()
+    g = ops.Graph()
+    with g.as_default():
+      inp, out = importer.import_graph_def(
+          graph_def=gdef, return_elements=["input", "output"])
+      inp = inp.outputs[0]
+      out = out.outputs[0]
+      # The session below feeds the same batch 30 times to mimic a calibration
+      # set of 30 batches. Real use should feed representative calibration data.
+    with self.test_session(
+        graph=g, config=self._config, use_gpu=True,
+        force_gpu=True) as sess:
+      for _ in range(30):
+        val = sess.run(out, {inp: dumm_inp})
+    return val
+
+  def get_trt_graph(self, mode):
+    """Return trt converted graph."""
+    if mode in  ["FP32", "FP16", "INT8"]:
+      return trt.create_inference_graph(
+          input_graph_def=self._original_graph,
+          outputs=["output"],
+          max_batch_size=self._input.shape[0],
+          max_workspace_size_bytes=1 << 25,
+          precision_mode=mode,  # TRT Engine precision "FP32","FP16" or "INT8"
+          minimum_segment_size=2  # minimum number of nodes in an engine
+          )
+    return None
+
+  def testFP32(self):
+    """Test FP32 conversion. Results should be identical to native case."""
+    trt_graph = self.get_trt_graph("FP32")
+    result = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+    result1 = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(result1, result)
+
+  def testFP16(self):
+    """Test FP16 conversion. Results may be different from native case."""
+    trt_graph = self.get_trt_graph("FP16")
+    result = self.run_graph(trt_graph, self._input)
+    self.assertAllClose(self._reference, result, rtol=1.e-03)
+    result1 = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(result1, result)
+
+  def testINT8(self):
+    """Test INT8 conversion. Results may be different from native case."""
+    calib_graph = self.get_trt_graph("INT8")
+    result = self.run_calibration(calib_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+    int8_graph = trt.calib_graph_to_infer_graph(calib_graph)
+    result = self.run_graph(int8_graph, self._input)
+    self.assertAllClose(self._reference, result, rtol=1.e-03)
+    result1 = self.run_graph(int8_graph, self._input)
+    self.assertAllEqual(result1, result)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
index 26793c80bf..9b593fecbb 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -60,7 +60,7 @@ def clip_covariance(
   # TODO(allenl): Smarter scaling here so that correlations are preserved when
   # fiddling with diagonal elements.
   diagonal = array_ops.matrix_diag_part(covariance_matrix)
-  maximum = math_ops.reduce_max(diagonal, axis=-1, keep_dims=True)
+  maximum = math_ops.reduce_max(diagonal, axis=-1, keepdims=True)
   new_diagonal = gen_math_ops.maximum(
       diagonal, maximum / maximum_variance_ratio)
   return array_ops.matrix_set_diag(
diff --git a/tensorflow/contrib/training/python/training/resample.py b/tensorflow/contrib/training/python/training/resample.py
index b16159bc16..7b8332b1d6 100644
--- a/tensorflow/contrib/training/python/training/resample.py
+++ b/tensorflow/contrib/training/python/training/resample.py
@@ -77,7 +77,7 @@ def resample_at_rate(inputs, rates, scope=None, seed=None, back_prop=False):
 
   Args:
     inputs: A list of tensors, each of which has a shape of `[batch_size, ...]`
-    rates: A tensor of shape `[batch_size]` contiaining the resampling rates
+    rates: A tensor of shape `[batch_size]` containing the resampling rates
        for each input.
     scope: Scope for the op.
     seed: Random seed to use.
diff --git a/tensorflow/contrib/training/python/training/sampling_ops.py b/tensorflow/contrib/training/python/training/sampling_ops.py
index ba888f87dc..7140f2a46d 100644
--- a/tensorflow/contrib/training/python/training/sampling_ops.py
+++ b/tensorflow/contrib/training/python/training/sampling_ops.py
@@ -123,7 +123,7 @@ def rejection_sample(tensors,
         batch_size=batch_size,
         num_threads=queue_threads)
 
-    # Queues return a single tensor if the list of enqued tensors is one. Since
+    # Queues return a single tensor if the list of enqueued tensors is one. Since
     # we want the type to always be the same, always return a list.
     if isinstance(minibatch, ops.Tensor):
       minibatch = [minibatch]
@@ -312,7 +312,7 @@ def _verify_input(tensor_list, labels, probs_list):
   """Verify that batched inputs are well-formed."""
   checked_probs_list = []
   for probs in probs_list:
-    # Since number of classes shouldn't change at runtime, probalities shape
+    # Since number of classes shouldn't change at runtime, probabilities shape
     # should be fully defined.
     probs.get_shape().assert_is_fully_defined()
 
@@ -407,7 +407,7 @@ def _calculate_acceptance_probabilities(init_probs, target_probs):
   ```
 
 
-  A solution for a_i in terms of the other variabes is the following:
+  A solution for a_i in terms of the other variables is the following:
     ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
   """
   # Make list of t_i / p_i.
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 99d486b183..39d75a0806 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -876,7 +876,7 @@ class SequenceQueueingStateSaver(object):
         ]):
           self._length = array_ops.identity(self._length)
 
-        # Only create barrier; enqueu and dequeue operations happen when you
+        # Only create barrier; enqueue and dequeue operations happen when you
         # access prefetch_op and next_batch.
         self._create_barrier()
         self._scope = scope
@@ -1637,7 +1637,7 @@ def _move_sparse_tensor_out_context(input_context, input_sequences, num_unroll):
 
   For `key, value` pairs in `input_context` with `SparseTensor` `value` removes
   them from `input_context` and transforms the `value` into a sequence and
-  then adding `key`, transformed `value` into `input_seuqences`.
+  then adding `key`, transformed `value` into `input_sequences`.
   The transformation is done by adding a new first dimension of `value_length`
   equal to that of the other values in input_sequences` and tiling the `value`
   every `num_unroll` steps.
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index a2ff29724b..ba1fd41565 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -145,6 +145,7 @@ load(
     "if_static",
 )
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library")
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
@@ -247,6 +248,15 @@ tf_nano_proto_library(
     deps = [":protos_all_cc"],
 )
 
+proto_library(
+    name = "example_protos",
+    srcs = [
+        "example/example.proto",
+        "example/feature.proto",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 exports_files([
     "framework/types.proto",
 ])
@@ -4066,3 +4076,9 @@ alias(
     actual = ":mobile_srcs",
     visibility = ["//visibility:public"],
 )
+
+closure_proto_library(
+    name = "example_protos_closure",
+    visibility = ["//visibility:public"],
+    deps = [":example_protos"],
+)
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
new file mode 100644
index 0000000000..145d05de59
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
@@ -0,0 +1,78 @@
+op {
+  graph_op_name: "ApplyAdaMax"
+  visibility: HIDDEN
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "beta1_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta1"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, m, and v tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AdaMax algorithm."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+v_t <- max(beta2 * v_{t-1}, abs(g))
+variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
new file mode 100644
index 0000000000..763760176a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "BroadcastTo"
+  in_arg {
+    name: "input"
+    description: <<END
+A Tensor to broadcast.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+A 1-D `int` Tensor. The shape of the desired output.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor.
+END
+  }
+  summary: "Broadcast an array for a compatible shape."
+  description: <<END
+Broadcasting is the process of making arrays have compatible shapes
+for arithmetic operations. Two shapes are compatible if for each
+dimension pair they are either equal or one of them is one. When trying
+to broadcast a Tensor to a shape, it starts with the trailing dimensions,
+and works its way forward.
+
+For example,
+```
+>>> x = tf.constant([1, 2, 3])
+>>> y = tf.broadcast_to(x, [3, 3])
+>>> sess.run(y)
+array([[1, 2, 3],
+       [1, 2, 3],
+       [1, 2, 3]], dtype=int32)
+```
+In the above example, the input Tensor with shape `[3]` (treated as `[1, 3]`)
+is broadcast to an output Tensor with shape `[3, 3]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
index 9b00f5b19d..56a3658fa0 100644
--- a/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
@@ -61,7 +61,7 @@ build the `tag` of the summary values:
    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
 
 The `bad_color` argument is the color to use in the generated images for
-non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
 Each element must be in the range `[0, 255]` (It represents the value of a
 pixel in the output image).  Non-finite values in the input tensor are
 replaced by this tensor in the output image.  The default value is the color
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
new file mode 100644
index 0000000000..a3f2188ba5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -0,0 +1,72 @@
+op {
+  graph_op_name: "ResourceApplyAdaMax"
+  visibility: HIDDEN
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "beta1_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta1"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, m, and v tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AdaMax algorithm."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+v_t <- max(beta2 * v_{t-1}, abs(g))
+variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt
new file mode 100644
index 0000000000..12fbdfdf3f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "StringStrip"
+  in_arg {
+    name: "input"
+    description: <<END
+A string `Tensor` of any shape.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A string `Tensor` of the same shape as the input.
+END
+  }
+  summary: "Strip leading and trailing whitespaces from the Tensor."
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt
new file mode 100644
index 0000000000..e49a355b81
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyAdaMax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
new file mode 100644
index 0000000000..083eeced81
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BroadcastTo"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt
new file mode 100644
index 0000000000..ca679e6889
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyAdaMax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index b8e773503c..ba5a3eea3a 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -378,7 +378,7 @@ class BFCAllocator : public VisitableAllocator {
   inline int Log2FloorNonZero(uint64 n) {
 #if defined(__GNUC__)
     return 63 ^ __builtin_clzll(n);
-#elif defined(PLATFORM_WINDOWS)
+#elif defined(PLATFORM_WINDOWS) && (_WIN64)
     unsigned long index;
     _BitScanReverse64(&index, n);
     return index;
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index b2ef51d10b..245320c896 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -31,6 +31,10 @@ limitations under the License.
 
 #include "i_malloc.h"
 
+#ifdef _WIN32
+typedef unsigned int uint;
+#endif
+
 namespace tensorflow {
 
 class MklSubAllocator : public SubAllocator {
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 0943b85fba..f6fe12e7ef 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -179,7 +179,7 @@ class StepSequenceInterface {
   virtual void RefreshStepIdSequenceAsync(int64 graph_key,
                                           const StatusCallback& done) = 0;
 
-  // Returns the the step_id that should be used for initiating a new execution
+  // Returns the step_id that should be used for initiating a new execution
   // on the specified graph. May return the same step_id multiple times if
   // RetireStepId or RefreshStepIdReservation is not called.
   virtual int64 NextStepId(int64 graph_key) = 0;
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index dab53cba3e..b1d0127809 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -111,7 +111,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE tensorflow::bfloat16 abs(
 }  // namespace numext
 }  // namespace Eigen
 
-#if defined(COMPILER_MSVC) && !defined(__clang__)
+#if defined(_MSC_VER) && !defined(__clang__)
 namespace std {
 template <>
 struct hash<Eigen::half> {
@@ -120,6 +120,6 @@ struct hash<Eigen::half> {
   }
 };
 }  // namespace std
-#endif  // COMPILER_MSVC
+#endif  // _MSC_VER
 
 #endif  // TENSORFLOW_FRAMEWORK_NUMERIC_TYPES_H_
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.h b/tensorflow/core/graph/mkl_tfconversion_pass.h
index 0562d8b3cd..84e50ee6e0 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.h
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.h
@@ -24,6 +24,10 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/core/graph/graph.h"
 
+#ifdef _WIN32
+typedef unsigned int uint;
+#endif
+
 namespace tensorflow {
 // Interface to invoke the pass for unit test
 //
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index c6352c1448..352f08fede 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -196,10 +196,19 @@ TEST_F(SingleMachineTest, GraphOptimizations) {
   TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
   std::set<string> cost_nodes;
   for (const auto& node : metadata.cost_graph().node()) {
+#ifdef INTEL_MKL
+    // Skip the special nodes inserted by TF (and MKL): these are either
+    // prefixed with an underscore or contain "/_".
+    if (node.name()[0] == '_' || node.name().find("/_") != string::npos) {
+      continue;
+    }
+    cost_nodes.insert(node.name());
+#else
     // Skip nodes added by TF internally.
     if (node.name()[0] != '_') {
       cost_nodes.insert(node.name());
     }
+#endif
   }
   const std::set<string> expected_cost_nodes = {
       "zero",      "one",      "add",         "square",
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 3f573cda10..ad2db685fc 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -243,6 +243,7 @@ cc_library(
     deps = [
         ":graph_optimizer",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h b/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h
index a80d46f416..4d7f8c98d0 100644
--- a/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -26,7 +27,8 @@ namespace grappler {
 class CustomGraphOptimizer : public GraphOptimizer {
  public:
   virtual ~CustomGraphOptimizer() {}
-  virtual Status Init() = 0;
+  virtual Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer*
+                          config = nullptr) = 0;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry_test.cc b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry_test.cc
index 629f5e83c1..bdb1ae8532 100644
--- a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry_test.cc
+++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry_test.cc
@@ -32,7 +32,10 @@ static const char* kTestOptimizerName = "Test";
 
 class TestGraphOptimizer : public CustomGraphOptimizer {
  public:
-  Status Init() override { return Status::OK(); }
+  Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer* config =
+                  nullptr) override {
+    return Status::OK();
+  }
   string name() const override { return kTestOptimizerName; }
   Status Optimize(Cluster* cluster, const GrapplerItem& item,
                   GraphDef* optimized_graph) override {
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index d9a386b9be..9fcf07651b 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -36,7 +36,10 @@ class TestOptimizer : public CustomGraphOptimizer {
   TestOptimizer() {}
   string name() const override { return "test_optimizer"; }
 
-  Status Init() override { return Status::OK(); }
+  Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer* config =
+                  nullptr) override {
+    return Status::OK();
+  }
 
   Status Optimize(Cluster* cluster, const GrapplerItem& item,
                   GraphDef* optimized_graph) override {
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index f7f6a9b505..201cd35798 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -617,6 +617,7 @@ cc_library(
         ":batch_space_ops",
         ":bcast_ops",
         ":bitcast_op",
+        ":broadcast_to_op",
         ":concat_op",
         ":constant_op",
         ":depth_space_ops",
@@ -668,6 +669,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS,
 )
 
+tf_kernel_library(
+    name = "broadcast_to_op",
+    prefix = "broadcast_to_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "concat_op",
     prefix = "concat_op",
@@ -4227,6 +4234,7 @@ cc_library(
         ":regex_replace_op",
         ":string_join_op",
         ":string_split_op",
+        ":string_strip_op",
         ":string_to_hash_bucket_op",
         ":substr_op",
     ],
@@ -4271,6 +4279,12 @@ tf_kernel_library(
     deps = STRING_DEPS,
 )
 
+tf_kernel_library(
+    name = "string_strip_op",
+    prefix = "string_strip_op",
+    deps = STRING_DEPS,
+)
+
 tf_kernel_library(
     name = "substr_op",
     prefix = "substr_op",
@@ -5947,8 +5961,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5963,8 +5976,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5980,8 +5992,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6001,8 +6012,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6018,8 +6028,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6035,8 +6044,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6044,8 +6052,7 @@ tf_mkl_kernel_library(
     srcs = ["mkl_fused_batch_norm_op.cc"],
     deps = NN_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6053,8 +6060,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_aggregate_ops",
     deps = MATH_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6062,8 +6068,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_concat_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6071,8 +6076,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_reshape_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6080,8 +6084,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_identity_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6089,8 +6092,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_lrn_op",
     deps = NN_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
index edc88a0384..b4bce90841 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -136,7 +136,7 @@ class SharedBatchScheduler
     // (inclusive). If there is a need to quantize the batch sizes, i.e. only
     // submit batches whose size is in a small set of allowed sizes, that can be
     // done by adding padding in the process-batch callback.
-    int max_batch_size = 1000;
+    size_t max_batch_size = 1000;
 
     // If a task has been enqueued for this amount of time (in microseconds),
     // and a thread is available, the scheduler will immediately form a batch
@@ -157,7 +157,7 @@ class SharedBatchScheduler
     // If this limit is reached, Schedule() will return an UNAVAILABLE error.
     // See the class documentation above for guidelines on how to tune this
     // parameter.
-    int max_enqueued_batches = 10;
+    size_t max_enqueued_batches = 10;
   };
   Status AddQueue(const QueueOptions& options,
                   std::function<void(std::unique_ptr<Batch<TaskType>>)>
@@ -394,7 +394,7 @@ Status SharedBatchScheduler<TaskType>::AddQueue(
     std::function<void(std::unique_ptr<Batch<TaskType>>)>
         process_batch_callback,
     std::unique_ptr<BatchScheduler<TaskType>>* queue) {
-  if (options.max_batch_size <= 0) {
+  if (options.max_batch_size == 0) {
     return errors::InvalidArgument("max_batch_size must be positive; was ",
                                    options.max_batch_size);
   }
diff --git a/tensorflow/core/kernels/broadcast_to_op.cc b/tensorflow/core/kernels/broadcast_to_op.cc
new file mode 100644
index 0000000000..2810925bbc
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op.cc
@@ -0,0 +1,91 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/broadcast_to_op.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class BroadcastToOp : public OpKernel {
+ public:
+  explicit BroadcastToOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  // Broadcasts input 0 to the shape held in input 1 and writes it to output 0.
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& input_tensor = ctx->input(0);
+    const TensorShape& input_shape = input_tensor.shape();
+
+    const Tensor& shape_tensor = ctx->input(1);
+    // Decode the requested output shape from the shape tensor.
+    TensorShape output_shape;
+    OP_REQUIRES_OK(ctx,
+                   ctx->op_kernel().MakeShape(shape_tensor, &output_shape));
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
+    // The functor checks shape compatibility itself and reports via `ctx`.
+    const Device& d = ctx->eigen_device<Device>();
+    functor::BroadcastTo<Device, T>()(d, ctx, *output_tensor, output_shape,
+                                      input_tensor, input_shape);
+  }
+};
+
+// As MakeShape is able to handle both DT_INT32 and DT_INT64,
+// no need to have TypeConstraint for `Tidx`
+#define REGISTER_KERNEL(type)                                           \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("BroadcastTo").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      BroadcastToOp<CPUDevice, type>);
+
+TF_CALL_ALL_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+
+namespace functor {
+// Forward-declares the GPU specializations instantiated in the .cu.cc file.
+#define DECLARE_GPU_TEMPLATE(Type)                              \
+  template <>                                                   \
+  void BroadcastTo<GPUDevice, Type>::operator()(                \
+      const GPUDevice& d, OpKernelContext* ctx, Tensor& output, \
+      const TensorShape& output_shape, const Tensor& input,     \
+      const TensorShape& input_shape);                          \
+  extern template struct BroadcastTo<GPUDevice, Type>;
+TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_TEMPLATE);
+#undef DECLARE_GPU_TEMPLATE
+}  // namespace functor
+
+#define REGISTER_KERNEL(type)                            \
+  REGISTER_KERNEL_BUILDER(Name("BroadcastTo")            \
+                              .Device(DEVICE_GPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("shape"),      \
+                          BroadcastToOp<GPUDevice, type>);
+
+TF_CALL_GPU_ALL_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/broadcast_to_op.h b/tensorflow/core/kernels/broadcast_to_op.h
new file mode 100644
index 0000000000..608e9b6ac9
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op.h
@@ -0,0 +1,220 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
+#define TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct BroadcastTo {
+  void operator()(const Device &d, OpKernelContext *ctx, Tensor &output_tensor,
+                  const TensorShape &output_shape, const Tensor &input_tensor,
+                  const TensorShape &input_shape) {
+#define BROADCAST_SHAPE(broadcast, reshape, NDIMS, input_shape, output_shape) \
+  for (int i = 0; i < NDIMS; i++) { /* never divide by a zero dim */          \
+    OP_REQUIRES(ctx,                                                          \
+                reshape[i] ? broadcast[i] % reshape[i] == 0 : !broadcast[i],  \
+                errors::InvalidArgument("invalid shape to broadcast from ",   \
+                                        input_shape.DebugString(), " to ",    \
+                                        output_shape.DebugString()));         \
+    broadcast[i] = reshape[i] ? broadcast[i] / reshape[i] : 1;                \
+  }
+    switch (output_shape.dims()) {
+      case 1: {
+        auto reshape = AsEigenDSizesWithPrefix<1>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<1>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 1, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 1>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 2: {
+        auto reshape = AsEigenDSizesWithPrefix<2>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<2>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 2, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 2>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 3: {
+        auto reshape = AsEigenDSizesWithPrefix<3>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<3>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 3, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 3>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 4: {
+        auto reshape = AsEigenDSizesWithPrefix<4>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<4>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 4, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 4>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 4: {
+            auto input = input_tensor.tensor<T, 4>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 5: {
+        auto reshape = AsEigenDSizesWithPrefix<5>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<5>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 5, input_shape, output_shape);
+        auto output = output_tensor.tensor<T, 5>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 4: {
+            auto input = input_tensor.tensor<T, 4>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 5: {
+            auto input = input_tensor.tensor<T, 5>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      default:
+        ctx->CtxFailure(errors::InvalidArgument(
+            "invalid shape to broadcast from ", input_shape.DebugString(),
+            " to ", output_shape.DebugString()));
+        break;
+    }
+  }
+
+ private:
+  // Pads `shape` on the left with 1s up to NDIMS dims. A `shape` of higher
+  // rank keeps only its trailing NDIMS sizes, so `dsizes` is never overrun.
+  template <int NDIMS>
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS> AsEigenDSizesWithPrefix(
+      const TensorShape &shape) const {
+    Eigen::DSizes<Eigen::DenseIndex, NDIMS> dsizes;
+    const int offset = NDIMS - shape.dims();
+    for (int d = 0; d < NDIMS; d++) {
+      dsizes[d] = (d < offset) ? 1 : shape.dim_size(d - offset);
+    }
+    return dsizes;
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
diff --git a/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc b/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
new file mode 100644
index 0000000000..6459571085
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/broadcast_to_op.h"
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define INSTANTIATE_GPU_KERNEL(Type) \
+  template class functor::BroadcastTo<GPUDevice, Type>;
+TF_CALL_GPU_ALL_TYPES(INSTANTIATE_GPU_KERNEL);
+#undef INSTANTIATE_GPU_KERNEL
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 4215c4541c..d2c8020bb6 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -139,9 +139,8 @@ class ConvParameters {
   bool ShouldIncludeWinogradNonfusedAlgo(
       se::StreamExecutor* stream_exec) const {
     // Skip this check for cuDNN 7 and newer.
-    se::port::StatusOr<std::tuple<int, int, int>> version =
-        stream_exec->AsDnn()->GetVersion();
-    if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
+    auto version = stream_exec->AsDnn()->GetVersion();
+    if (version.ok() && version.ValueOrDie().major_version() >= 7) {
       return true;
     }
     return ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc
index 96bdb6a241..8cadeac68d 100644
--- a/tensorflow/core/kernels/ctc_decoder_ops.cc
+++ b/tensorflow/core/kernels/ctc_decoder_ops.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
+#include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
 
@@ -213,20 +214,29 @@ class CTCGreedyDecoderOp : public OpKernel {
 
     // Perform best path decoding
     std::vector<std::vector<std::vector<int> > > sequences(batch_size);
-    for (int b = 0; b < batch_size; ++b) {
-      sequences[b].resize(1);
-      auto& sequence = sequences[b][0];
-      int prev_indices = -1;
-      for (int t = 0; t < seq_len_t(b); ++t) {
-        int max_class_indices;
-        log_prob_t(b, 0) += -RowMax(input_list_t[t], b, &max_class_indices);
-        if (max_class_indices != blank_index &&
-            !(merge_repeated_ && max_class_indices == prev_indices)) {
-          sequence.push_back(max_class_indices);
+    auto decode = [&](const int64 begin, const int64 end) {
+      for (int b = begin; b < end; ++b) {
+        sequences[b].resize(1);
+        auto &sequence = sequences[b][0];
+        int prev_indices = -1;
+        for (int t = 0; t < seq_len_t(b); ++t) {
+          int max_class_indices;
+          log_prob_t(b, 0) += -RowMax(input_list_t[t], b, &max_class_indices);
+          if (max_class_indices != blank_index &&
+              !(merge_repeated_ && max_class_indices == prev_indices)) {
+            sequence.push_back(max_class_indices);
+          }
+          prev_indices = max_class_indices;
         }
-        prev_indices = max_class_indices;
       }
-    }
+    };
+
+    const int64 kCostPerUnit = 50 * max_time * num_classes;
+    const int64 total = batch_size;
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *ctx->device()->tensorflow_cpu_worker_threads();
+    Shard(worker_threads.num_threads, worker_threads.workers, total,
+          kCostPerUnit, decode);
 
     OP_REQUIRES_OK(
         ctx, decode_helper_.StoreAllDecodedSequences(
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index ea763ce85b..cda1402b03 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -312,9 +312,8 @@ class MklInputConversionOp : public OpKernel {
           VLOG(1) << "MklInputConversionOp: Shape is same, but format is "
                      "different, "
                   << "need to convert to same format";
-
-          // Convert input0, and keep input1 unchanged
-          // Create MklDnnShape for output mkl tensor based on input0
+          // TODO: For now, input0 is converted and input1 is unchanged
+          //       we should choose the optimal MKL format to convert to.
           Tensor* tensor_out;
           MklDnnShape mkl_output_mkl_shape;
           mkl_output_mkl_shape.SetMklTensor(true);
@@ -362,7 +361,8 @@ class MklInputConversionOp : public OpKernel {
       // with MKL tensors)
       VLOG(1) << "MklInputConversionOp: Broadcast needed, "
               << "converted MKL inputs to TF format";
-
+      // TODO: Cleanup op_data_type and has_avx512f_ after these two parameters
+      //       are removed from ConvertMklToTf
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
                                            op_data_type, has_avx512f_,
                                            kInputIndex_0);
@@ -403,19 +403,7 @@ class MklInputConversionOp : public OpKernel {
     }
 
     // Broadcast is needed if the shapes are not the same
-    bool broadcast_needed;
-
-    size_t in0_size = 1;
-    for (size_t i = 0; i < mkl_shape->GetDimension(); ++i)
-      in0_size *= mkl_shape->TfDimSize(i);
-
-    size_t in1_size = 1;
-    for (size_t i = 0; i < tf_tensor->shape().dims(); ++i)
-      in1_size *= tf_tensor->shape().dim_size(i);
-
-    broadcast_needed = (in0_size != in1_size);
-
-    if (!broadcast_needed) {
+    if (mkl_shape->GetTfShape().num_elements() == tf_tensor->shape().num_elements() ) {
       // Both shapes are same, convert the TF input to MKL
       VLOG(1) << "MklInputConversionOp: No broadcast needed.";
       VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index
@@ -446,10 +434,19 @@ class MklInputConversionOp : public OpKernel {
 
       // Create reorder between tensorflow layout and Mkl layout if necessary
       std::vector<primitive> net;
-      tf_input.CheckReorderToOpMem(
+      bool reordered = tf_input.CheckReorderToOpMem(
                    memory::primitive_desc(output_mkl_md, cpu_engine),
                    tensor_out, &net);
-      stream(stream::kind::eager).submit(net).wait();
+      if (!reordered) {
+        // No reorder is needed: the TF tensor already has the same shape and
+        // format as the MKL tensor. It still cannot simply be forwarded to the
+        // output, because an MKL data tensor is always one-dimensional, so
+        // Tensor::CopyFrom is used to share the input's buffer while keeping
+        // the output's shape. NOTE(review): CopyFrom's result is not checked.
+        tensor_out->CopyFrom(*tf_tensor, tensor_out->shape());
+      } else {
+        stream(stream::kind::eager).submit(net).wait();
+      }
 
       // -- The tensor in MKL format passes through --
       ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index);
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 0a0f69522f..1ed43834dd 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -441,7 +441,9 @@ class MklReluOpBase : public OpKernel {
       // Allocate output and MklDnnShape tensors separately for possible
       // in-place operation
       OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                      {src_index}, dst_index, tf_shape_dst, &dst_tensor));
+                                      {static_cast<const int>(src_index)},
+                                      static_cast<const int>(dst_index),
+                                      tf_shape_dst, &dst_tensor));
       AllocateOutputSetMklShape(context, dst_index, dnn_shape_dst);
 
       // Destination memory descriptor is same as source memory descriptor.
@@ -611,7 +613,9 @@ class MklReluGradOpBase : public OpKernel {
       // Allocate diff_src and MklDnnShape tensors separately for possible
       // in-place operation
       OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                      {diff_dst_index}, diff_src_index, tf_shape_diff_src,
+                                      {static_cast<const int>(diff_dst_index)},
+                                      static_cast<const int>(diff_src_index),
+                                      tf_shape_diff_src,
                                       &diff_src_tensor));
       AllocateOutputSetMklShape(context, diff_src_index, dnn_shape_diff_src);
 
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
index bcbdbee058..4b630809c5 100644
--- a/tensorflow/core/kernels/roll_op.cc
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -254,8 +254,11 @@ class RollOp : public OpKernel {
     // total modulo sum of shifts for each dimension
     gtl::InlinedVector<int, 4> shift_mod_sum(num_dims, 0);
     for (int i = 0; i < num_shifts; i++) {
-      const int axis = axis_flat(i);
-      OP_REQUIRES(context, axis < num_dims,
+      int axis = axis_flat(i);
+      if (axis < 0) {
+        axis += num_dims;
+      }
+      OP_REQUIRES(context, 0 <= axis && axis < num_dims,
                   errors::InvalidArgument("axis ", axis, " is out of range"));
       const int ds = std::max<int>(static_cast<int>(input.dim_size(axis)), 1);
       const int sum = shift_mod_sum[axis] + static_cast<int>(shift_flat(i));
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index 183e5a1d58..bedd965966 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -16,6 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 
+
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
 // This file requires the following include because it uses CudaAtomicMax:
 // #include "tensorflow/core/util/cuda_kernel_helper.h"
 
diff --git a/tensorflow/core/kernels/string_strip_op.cc b/tensorflow/core/kernels/string_strip_op.cc
new file mode 100644
index 0000000000..ae700f4294
--- /dev/null
+++ b/tensorflow/core/kernels/string_strip_op.cc
@@ -0,0 +1,53 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/string_ops.cc.
+
+#include <string>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+class StringStripOp : public OpKernel {
+ public:
+  explicit StringStripOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, input_tensor->shape(), &output_tensor));
+    // Output 0 has the input's shape, so the flat views below share indices.
+    const auto input = input_tensor->flat<string>();
+    auto output = output_tensor->flat<string>();
+    // Run RemoveWhitespaceContext on a view of each entry, then copy it out.
+    for (int64 i = 0; i < input.size(); ++i) {
+      StringPiece entry(input(i));
+      str_util::RemoveWhitespaceContext(&entry);
+      output(i) = entry.ToString();
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StringStrip").Device(DEVICE_CPU), StringStripOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index f53c567c4d..5b13b10937 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -330,6 +330,27 @@ struct ApplyAdamSYCL {
 template <typename T>
 struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T> {};
 
+template <typename Device, typename T>
+struct ApplyAdaMaxNonCuda {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad) {
+    m.device(d) += (grad - m) * (T(1) - beta1());
+    // Here v is u in section 7.1 of the Adam paper: max(beta2 * v, |grad|).
+    v.device(d) = (beta2() * v).cwiseMax(grad.abs());
+    // var is θ in section 7.1; epsilon is added to v before dividing.
+    var.device(d) -= lr() / (T(1) - beta1_power()) * (m / (v + epsilon()));
+  }
+};
+
+template <typename T>
+struct ApplyAdaMax<CPUDevice, T> : ApplyAdaMaxNonCuda<CPUDevice, T> {};
+
 template <typename T>
 struct ApplyRMSProp<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@@ -2751,6 +2772,135 @@ REGISTER_KERNELS(GPU, double);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ApplyAdaMaxOp : public OpKernel {
+ public:
+  explicit ApplyAdaMaxOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
+    // Inputs 0-2 are var, m and v; fail if any is not yet initialized.
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, false, &var));
+    Tensor m;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, false, &m));
+    Tensor v;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 2, use_exclusive_lock_, false, &v));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, m.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    OP_REQUIRES(
+        ctx, v.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(2)));
+    // Inputs 3-7 carry the hyperparameters; each must be a scalar.
+    const Tensor& beta1_power = ctx->input(3);
+    const Tensor& lr = ctx->input(4);
+    const Tensor& beta1 = ctx->input(5);
+    const Tensor& beta2 = ctx->input(6);
+    const Tensor& epsilon = ctx->input(7);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()),
+                errors::InvalidArgument("beta1_power is not a scalar: ",
+                                        beta1_power.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()),
+                errors::InvalidArgument("beta1 is not a scalar: ",
+                                        beta1.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()),
+                errors::InvalidArgument("beta2 is not a scalar: ",
+                                        beta2.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()),
+                errors::InvalidArgument("epsilon is not a scalar: ",
+                                        epsilon.shape().DebugString()));
+    // m, v and grad (input 8) must all match var's shape.
+    const Tensor& grad = ctx->input(8);
+    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        m.shape().DebugString()));
+    OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()),
+                errors::InvalidArgument("var and v do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        v.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+    // Hand the flat views and scalar hyperparameters to the device functor.
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyAdaMax<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
+        beta1_power.scalar<T>(), lr.scalar<T>(),
+        beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
+        grad.flat<T>());
+    // NOTE(review): presumably forwards ref input 0 (var) to output 0.
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;  // Value of the "use_locking" attr.
+};
+
+#define REGISTER_KERNELS(D, T)                                       \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("ApplyAdaMax").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyAdaMaxOp<D##Device, T>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdaMax")                \
+                              .HostMemory("var")                     \
+                              .HostMemory("m")                       \
+                              .HostMemory("v")                       \
+                              .Device(DEVICE_##D)                    \
+                              .TypeConstraint<T>("T"),               \
+                          ApplyAdaMaxOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                   \
+  template <>                                                 \
+  void ApplyAdaMax<GPUDevice, T>::operator()(                 \
+      const GPUDevice& d, typename TTypes<T>::Flat var,       \
+      typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \
+      typename TTypes<T>::ConstScalar beta1_power,            \
+      typename TTypes<T>::ConstScalar lr,                     \
+      typename TTypes<T>::ConstScalar beta1,                  \
+      typename TTypes<T>::ConstScalar beta2,                  \
+      typename TTypes<T>::ConstScalar epsilon,                \
+      typename TTypes<T>::ConstFlat grad);                    \
+  extern template struct ApplyAdaMax<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
 template <typename Device, typename T>
 class ApplyRMSPropOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 7ee956053a..f536a61eb0 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -139,6 +139,18 @@ struct ApplyAdam {
                   typename TTypes<T>::ConstFlat grad, bool use_nesterov);
 };
 
+template <typename Device, typename T>
+struct ApplyAdaMax {  // One AdaMax update step; specialized per Device.
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
 template <typename Device, typename T>
 struct ApplyRMSProp {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 0376a3b2c6..2aa17f2a0f 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -142,6 +142,32 @@ struct ApplyAdam<GPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyAdaMax<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad) {
+    // Scalars are rank-0, so broadcast each to grad's length before use.
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+    const auto one = static_cast<T>(1.0);
+    m.device(d) =
+        m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+                (grad - m);
+    v.device(d) =
+        (beta2.reshape(single).broadcast(bcast) * v).cwiseMax(grad.abs());
+    var.device(d) -= (lr / (beta1_power.constant(one) - beta1_power))
+                         .reshape(single).broadcast(bcast) *
+                     (m / (v + epsilon.reshape(single).broadcast(bcast)));
+  }
+};
+
 template <typename T>
 struct ApplyRMSProp<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
@@ -278,6 +304,10 @@ template struct functor::ApplyAdam<GPUDevice, Eigen::half>;
 template struct functor::ApplyAdam<GPUDevice, float>;
 template struct functor::ApplyAdam<GPUDevice, double>;
 
+template struct functor::ApplyAdaMax<GPUDevice, Eigen::half>;
+template struct functor::ApplyAdaMax<GPUDevice, float>;
+template struct functor::ApplyAdaMax<GPUDevice, double>;
+
 template struct functor::ApplyRMSProp<GPUDevice, Eigen::half>;
 template struct functor::ApplyRMSProp<GPUDevice, float>;
 template struct functor::ApplyRMSProp<GPUDevice, double>;
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index e7c24387a4..2c0576ff10 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -88,15 +88,13 @@ struct bfloat16 {
       : bfloat16(static_cast<float>(val)) {}
 
   B16_DEVICE_FUNC explicit operator float() const {
-    float result;
+    float result = 0;
 
     uint16_t* q = reinterpret_cast<uint16_t*>(&result);
 
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     q[0] = value;
-    q[1] = 0;
 #else
-    q[0] = 0;
     q[1] = value;
 #endif
     return result;
diff --git a/tensorflow/core/lib/gtl/manual_constructor.h b/tensorflow/core/lib/gtl/manual_constructor.h
index 0a76e0962e..0176cdc94d 100644
--- a/tensorflow/core/lib/gtl/manual_constructor.h
+++ b/tensorflow/core/lib/gtl/manual_constructor.h
@@ -53,7 +53,7 @@ template <int size>
 struct AlignType<0, size> {
   typedef char result[size];
 };
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
 #define TF_LIB_GTL_ALIGN_ATTRIBUTE(X) __declspec(align(X))
 #define TF_LIB_GTL_ALIGN_OF(T) __alignof(T)
 #elif defined(COMPILER_GCC3) || __GNUC__ >= 3 || defined(__APPLE__) || \
diff --git a/tensorflow/core/lib/strings/stringprintf.cc b/tensorflow/core/lib/strings/stringprintf.cc
index 03eba4c851..bbffa062a9 100644
--- a/tensorflow/core/lib/strings/stringprintf.cc
+++ b/tensorflow/core/lib/strings/stringprintf.cc
@@ -22,12 +22,6 @@ limitations under the License.
 namespace tensorflow {
 namespace strings {
 
-#ifdef COMPILER_MSVC
-enum { IS_COMPILER_MSVC = 1 };
-#else
-enum { IS_COMPILER_MSVC = 0 };
-#endif
-
 void Appendv(string* dst, const char* format, va_list ap) {
   // First try with a small fixed size buffer
   static const int kSpaceLength = 1024;
@@ -48,13 +42,13 @@ void Appendv(string* dst, const char* format, va_list ap) {
       return;
     }
 
-    if (IS_COMPILER_MSVC) {
+#ifdef _MSC_VER
       // Error or MSVC running out of space.  MSVC 8.0 and higher
       // can be asked about space needed with the special idiom below:
       va_copy(backup_ap, ap);
       result = vsnprintf(nullptr, 0, format, backup_ap);
       va_end(backup_ap);
-    }
+#endif
 
     if (result < 0) {
       // Just an error.
diff --git a/tensorflow/core/lib/strings/stringprintf_test.cc b/tensorflow/core/lib/strings/stringprintf_test.cc
index d61a1a945a..02cf4cbcad 100644
--- a/tensorflow/core/lib/strings/stringprintf_test.cc
+++ b/tensorflow/core/lib/strings/stringprintf_test.cc
@@ -30,9 +30,9 @@ TEST(PrintfTest, Empty) {
 
 TEST(PrintfTest, Misc) {
 // MSVC does not support $ format specifier.
-#if !defined(COMPILER_MSVC)
+#if !defined(_MSC_VER)
   EXPECT_EQ("123hello w", Printf("%3$d%2$s %1$c", 'w', "hello", 123));
-#endif  // !COMPILER_MSVC
+#endif  // !_MSC_VER
 }
 
 TEST(AppendfTest, Empty) {
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 2a8b9f9bee..88fc03826a 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -429,6 +429,58 @@ REGISTER_OP("UnravelIndex")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) { return Status::OK(); });
 
+REGISTER_OP("BroadcastTo")
+    .Input("input: T")
+    .Input("shape: Tidx")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle in = c->input(0);
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
+
+      if (!c->RankKnown(out)) {
+        // We have no information about the shape of the output.
+        c->set_output(0, out);
+        return Status::OK();
+      }
+
+      if (!c->RankKnown(in)) {
+        // We have no information about the shape of the input,
+        // nothing to do here.
+        c->set_output(0, out);
+        return Status::OK();
+      }
+      if (c->Rank(out) < c->Rank(in)) {
+        return errors::InvalidArgument("Cannot broadcast a tensor with shape ",
+                                       c->DebugString(in), " to shape ",
+                                       c->DebugString(out));
+      }
+
+      const int32 in_offset = c->Rank(out) - c->Rank(in);
+      // The first in_offset dimensions of the output have no counterpart in
+      // the input (the input is implicitly expanded with 1s there), so only
+      // the trailing dimensions need to be checked.
+      for (int32 i = in_offset; i < c->Rank(out); ++i) {
+        DimensionHandle dim = c->Dim(out, i);
+        DimensionHandle in_dim = c->Dim(in, i - in_offset);
+        if (!c->ValueKnown(dim) || !c->ValueKnown(in_dim)) continue;
+        const int64 out_size = c->Value(dim);
+        const int64 in_size = c->Value(in_dim);
+        // A zero-sized input dimension only matches a zero-sized output; test
+        // it separately so the modulo below never divides by zero.
+        if (in_size == 0 ? out_size != 0 : out_size % in_size != 0) {
+          return errors::InvalidArgument(
+              "Cannot broadcast a tensor with shape ", c->DebugString(in),
+              " to shape ", c->DebugString(out));
+        }
+      }
+
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
 // --------------------------------------------------------------------------
 // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph
 // in the N == 1 case to remove the node.
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 67c6c58fe2..4ba3f15ef0 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -148,7 +148,11 @@ REGISTER_OP("BytesProducedStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("LatencyStatsDataset")
     .Input("input_dataset: variant")
@@ -156,7 +160,11 @@ REGISTER_OP("LatencyStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("SetStatsAggregatorDataset")
     .Input("input_dataset: variant")
@@ -206,7 +214,12 @@ REGISTER_OP("PrefetchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ScanDataset")
     .Input("input_dataset: variant")
@@ -290,7 +303,12 @@ REGISTER_OP("BatchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 // TODO(mrry): move SlideDataset to contrib in the future.
 REGISTER_OP("SlideDataset")
@@ -300,7 +318,13 @@ REGISTER_OP("SlideDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // window_size and stride should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("PaddedBatchDataset")
     .Input("input_dataset: variant")
@@ -330,7 +354,14 @@ REGISTER_OP("DenseToSparseBatchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // row_shape should be a 1-D vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("RangeDataset")
     .Input("start: int64")
@@ -341,7 +372,14 @@ REGISTER_OP("RangeDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // start, stop, and step should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("RandomDataset")
     .Input("seed: int64")
@@ -351,7 +389,13 @@ REGISTER_OP("RandomDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // seed and seed2 should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ShuffleDataset")
     .Input("input_dataset: variant")
@@ -362,7 +406,14 @@ REGISTER_OP("ShuffleDataset")
     .Attr("reshuffle_each_iteration: bool = true")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, and seed2 should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ShuffleAndRepeatDataset")
     .Input("input_dataset: variant")
@@ -373,7 +424,15 @@ REGISTER_OP("ShuffleAndRepeatDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, seed2, and count should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("CacheDataset")
     .Input("input_dataset: variant")
@@ -381,7 +440,12 @@ REGISTER_OP("CacheDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // filename should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TextLineDataset")
     .Input("filenames: string")
@@ -390,10 +454,16 @@ REGISTER_OP("TextLineDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): validate
-                                                // that `filenames` is
-                                                // a scalar or a
-                                                // vector.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      // `compression_type` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // `buffer_size` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("SqlDataset")
     .Input("driver_name: string")
@@ -404,7 +474,14 @@ REGISTER_OP("SqlDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // driver_name, data_source_name, and query should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("FixedLengthRecordDataset")
     .Input("filenames: string")
@@ -415,7 +492,18 @@ REGISTER_OP("FixedLengthRecordDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      // header_bytes, record_bytes, footer_bytes, buffer_size should be
+      // scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TFRecordDataset")
     .Input("filenames: string")
@@ -424,7 +512,16 @@ REGISTER_OP("TFRecordDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      // `compression_type` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // `buffer_size` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("Iterator")
     .Output("handle: resource")
@@ -540,7 +637,12 @@ REGISTER_OP("PrependFromQueueAndPaddedBatchDataset")
     // length of `output_types` is `N`, the `output_shapes` are
     // (as far as possible to tell statically) compatible with `padded_shapes`,
     // and that `padding_values` are all scalars.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("EnqueueInQueueDataset")
     .Input("queue: variant")
diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
index 95b4774fe6..e180f3d5f6 100644
--- a/tensorflow/core/ops/manip_ops.cc
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -28,6 +28,17 @@ REGISTER_OP("Roll")
     .Attr("T: type")
     .Attr("Tshift: {int32,int64}")
     .Attr("Taxis: {int32,int64}")
-    .SetShapeFn(shape_inference::UnchangedShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // The `input` must be 1-D or higher.
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &unused));
+      // The `shift` must be scalar or 1-D.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &unused));
+      // The `axis` must be scalar or 1-D.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
+      // Validate that `shift` has the same shape as `axis`.
+      TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->input(2), &unused));
+      return shape_inference::UnchangedShape(c);
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 6dc3d9df31..bb46dafd42 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1535,6 +1535,7 @@ REGISTER_OP("__MklDummyConv2DWithBias")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 Dummy node that enables fusing Conv2D and BiasAdd operator for MKL. This node
 does not perform anything. It is just created as an intermediate output of
@@ -1561,6 +1562,7 @@ REGISTER_OP("_MklConv2DWithBias")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 MKL version of Conv2D and BiasAdd operator. Uses MKL DNN APIs to perform
 2D convolution and add Bias to the output of convolution.
@@ -1683,6 +1685,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+#ifdef INTEL_MKL_ML
 REGISTER_OP("_MklConv2DWithBiasBackpropBias")
     .Input("out_backprop: T")
     .Input("mkl_out_backprop: uint8")
@@ -1699,6 +1702,7 @@ gradients of convolution with respect to the bias.
 NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
+#endif
 
 REGISTER_OP("_MklConv2DBackpropInput")
     .Input("input_sizes: int32")
@@ -2156,6 +2160,7 @@ REGISTER_OP("_MklToTf")
     .Output("output: T")
     .Attr("T: {half, float, double}")
     .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to convert a tensor from MKL layout to TensorFlow layout.
 
@@ -2177,6 +2182,7 @@ REGISTER_OP("_MklInputConversion")
         "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, "
         "complex64, complex128}")
     .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to process the inputs to an elementwise MKL op. Both inputs
 need to be either in TF or in MKL format. This op is added before every
diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index f6c668f5c9..416ce9c0d8 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -43,7 +43,12 @@ REGISTER_OP("RandomUniformInt")
     .Attr("seed2: int = 0")
     .Attr("Tout: {int32, int64}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape);
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::RandomShape(c);
+    });
 
 REGISTER_OP("RandomStandardNormal")
     .Input("shape: T")
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 05f216a83e..469f193cf4 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -123,6 +123,11 @@ REGISTER_OP("StringSplit")
       return Status::OK();
     });
 
+REGISTER_OP("StringStrip")
+    .Input("input: string")
+    .Output("output: string")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 REGISTER_OP("EncodeBase64")
     .Input("input: string")
     .Output("output: string")
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 6ce9595fb6..dc7b588898 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -737,6 +737,57 @@ REGISTER_OP("ResourceApplyAdam")
       return ApplyAdamShapeFn(c, false /* sparse */);
     });
 
+static Status ApplyAdaMaxShapeFn(InferenceContext* c, bool sparse) {  // shared by both AdaMax ops
+  ShapeHandle unused;  // sink for the rank-only checks below
+  ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s));  // m, merged with var
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s));  // v, merged with var/m
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));       // beta1_power (rank 0)
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));       // lr (rank 0)
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));       // beta1 (rank 0)
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));       // beta2 (rank 0)
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));       // epsilon (rank 0)
+  TF_RETURN_IF_ERROR(
+      HandleGradAndIndicesInputs(c, sparse, 8 /* grad_idx */, &s));  // grad
+  if (c->num_outputs() > 0) {  // ResourceApplyAdaMax declares no output
+    c->set_output(0, s);
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("ApplyAdaMax")
+    .Input("var: Ref(T)")
+    .Input("m: Ref(T)")
+    .Input("v: Ref(T)")
+    .Input("beta1_power: T")
+    .Input("lr: T")
+    .Input("beta1: T")
+    .Input("beta2: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAdaMaxShapeFn(c, false /* sparse */);
+    });
+
+REGISTER_OP("ResourceApplyAdaMax")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("v: resource")
+    .Input("beta1_power: T")
+    .Input("lr: T")
+    .Input("beta1: T")
+    .Input("beta2: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAdaMaxShapeFn(c, false /* sparse */);
+    });
+
 static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
   ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc
index 2b874da198..c6e5777c26 100644
--- a/tensorflow/core/platform/default/logging.cc
+++ b/tensorflow/core/platform/default/logging.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <android/log.h>
 #include <iostream>
 #include <sstream>
+#include <cstring>
 #endif
 
 #include <stdlib.h>
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 9a71fbe2b7..a8cb40502c 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -109,6 +109,8 @@ class LibHDFS {
 // in the libhdfs documentation.
 #if defined(PLATFORM_WINDOWS)
     const char* kLibHdfsDso = "hdfs.dll";
+#elif defined(MACOS) || defined(TARGET_OS_MAC)
+    const char* kLibHdfsDso = "libhdfs.dylib";
 #else
     const char* kLibHdfsDso = "libhdfs.so";
 #endif
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 9b6202e7b4..029b27cd04 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -6,6 +6,8 @@ option java_outer_classname = "RewriterConfigProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
 
+import "tensorflow/core/framework/attr_value.proto";
+
 message AutoParallelOptions {
   bool enable = 1;
   int32 num_replicas = 2;
@@ -119,4 +121,13 @@ message RewriterConfig {
   // Custom registered optimizers will be run after the base optimizers, in
   // the order that they are specified.
   repeated string optimizers = 100;
+
+  // Message to describe custom graph optimizer and its parameters
+  message CustomGraphOptimizer {
+    string name = 1;
+    map<string, AttrValue> parameter_map = 2;
+  }
+
+  // list of CustomGraphOptimizers to apply.
+  repeated CustomGraphOptimizer custom_optimizers = 200;
 }
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 706968d347..0ca7d8475f 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 7
+#define TF_MINOR_VERSION 8
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX ""
+#define TF_VERSION_SUFFIX "-rc0"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index 1fa6b8bec0..d3439cbc93 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -185,7 +185,7 @@ const void* MemmappedFileSystem::GetMemoryWithOffset(uint64 offset) const {
   return reinterpret_cast<const uint8*>(mapped_memory_->data()) + offset;
 }
 
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
 constexpr char* MemmappedFileSystem::kMemmappedPackagePrefix;
 constexpr char* MemmappedFileSystem::kMemmappedPackageDefaultGraphDef;
 #else
diff --git a/tensorflow/core/util/memmapped_file_system.h b/tensorflow/core/util/memmapped_file_system.h
index 76cc4911f5..958e23d28e 100644
--- a/tensorflow/core/util/memmapped_file_system.h
+++ b/tensorflow/core/util/memmapped_file_system.h
@@ -53,7 +53,7 @@ class MemmappedFileSystem : public FileSystem {
  public:
   // Memmapped regions use this prefix to distinguish from
   // the filesystem.
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
   static constexpr char* kMemmappedPackagePrefix =
 #else
   static constexpr char kMemmappedPackagePrefix[] =
@@ -61,7 +61,7 @@ class MemmappedFileSystem : public FileSystem {
       "memmapped_package://";
 
 // The default graphdef in the package.
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
   static constexpr char* kMemmappedPackageDefaultGraphDef =
 #else
   static constexpr char kMemmappedPackageDefaultGraphDef[] =
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 9f58e40d94..bc6d2d77a4 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -45,6 +45,10 @@ using mkldnn::primitive;
 using mkldnn::reorder;
 #endif
 
+#ifdef _WIN32
+typedef unsigned int uint;
+#endif
+
 // The file contains a number of utility classes and functions used by MKL
 // enabled kernels
 
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
index f3db5857ae..74fe4a323a 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
@@ -6,43 +6,39 @@ Monte Carlo integration and helpers.
 ## Background
 
 Monte Carlo integration refers to the practice of estimating an expectation with
-a sample mean.  For example, given random variable `Z in \\(R^k\\)` with density `p`,
+a sample mean.  For example, given random variable Z in \\(R^k\\) with density `p`,
 the expectation of function `f` can be approximated like:
 
-```
 $$E_p[f(Z)] = \int f(z) p(z) dz$$
 $$          ~ S_n
           := n^{-1} \sum_{i=1}^n f(z_i),  z_i\ iid\ samples\ from\ p.$$
-```
 
-If `\\(E_p[|f(Z)|] < infinity\\)`, then `\\(S_n\\) --> \\(E_p[f(Z)]\\)` by the strong law of large
-numbers.  If `\\(E_p[f(Z)^2] < infinity\\)`, then `\\(S_n\\)` is asymptotically normal with
-variance `\\(Var[f(Z)] / n\\)`.
+If \\(E_p[|f(Z)|] < infinity\\), then \\(S_n\\) --> \\(E_p[f(Z)]\\) by the strong law of large
+numbers.  If \\(E_p[f(Z)^2] < infinity\\), then \\(S_n\\) is asymptotically normal with
+variance \\(Var[f(Z)] / n\\).
 
 Practitioners of Bayesian statistics often find themselves wanting to estimate
-`\\(E_p[f(Z)]\\)` when the distribution `p` is known only up to a constant.  For
+\\(E_p[f(Z)]\\) when the distribution `p` is known only up to a constant.  For
 example, the joint distribution `p(z, x)` may be known, but the evidence
-`\\(p(x) = \int p(z, x) dz\\)` may be intractable.  In that case, a parameterized
-distribution family `\\(q_\lambda(z)\\)` may be chosen, and the optimal `\\(\lambda\\)` is the
-one minimizing the KL divergence between `\\(q_\lambda(z)\\)` and
-`\\(p(z | x)\\)`.  We only know `p(z, x)`, but that is sufficient to find `\\(\lambda\\)`.
+\\(p(x) = \int p(z, x) dz\\) may be intractable.  In that case, a parameterized
+distribution family \\(q_\lambda(z)\\) may be chosen, and the optimal \\(\lambda\\) is the
+one minimizing the KL divergence between \\(q_\lambda(z)\\) and
+\\(p(z | x)\\).  We only know `p(z, x)`, but that is sufficient to find \\(\lambda\\).
 
 
 ## Log-space evaluation and subtracting the maximum
 
 Care must be taken when the random variable lives in a high dimensional space.
-For example, the naive importance sample estimate `\\(E_q[f(Z) p(Z) / q(Z)]\\)`
-involves the ratio of two terms `\\(p(Z) / q(Z)\\)`, each of which must have tails
-dropping off faster than `\\(O(|z|^{-(k + 1)})\\)` in order to have finite integral.
+For example, the naive importance sample estimate \\(E_q[f(Z) p(Z) / q(Z)]\\)
+involves the ratio of two terms \\(p(Z) / q(Z)\\), each of which must have tails
+dropping off faster than \\(O(|z|^{-(k + 1)})\\) in order to have finite integral.
 This ratio would often be zero or infinity up to numerical precision.
 
 For that reason, we write
 
-```
 $$Log E_q[ f(Z) p(Z) / q(Z) ]$$
 $$   = Log E_q[ \exp\{Log[f(Z)] + Log[p(Z)] - Log[q(Z)] - C\} ] + C,$$  where
 $$C := Max[ Log[f(Z)] + Log[p(Z)] - Log[q(Z)] ].$$
-```
 
 The maximum value of the exponentiated term will be 0.0, and the expectation
 can be evaluated in a stable manner.
diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index d5bc7a5a7a..8639656d07 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -402,24 +402,24 @@ types and default values.
 
 For example:
 
-    ```c++
-    REGISTER_OP("PngDecode")
-      .Input("contents: string")
-      .Attr("channels: int = 0")
-      .Output("image: uint8")
-      .Doc(R"doc(
-    Decodes the contents of a PNG file into a uint8 tensor.
-
-    contents: PNG file contents.
-    channels: Number of color channels, or 0 to autodetect based on the input.
-      Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.
-      If the input has a different number of channels, it will be transformed
-      accordingly.
-    image:= A 3-D uint8 tensor of shape `[height, width, channels]`.
-      If `channels` is 0, the last dimension is determined
-      from the png contents.
-    )doc");
-    ```
+```c++
+REGISTER_OP("PngDecode")
+  .Input("contents: string")
+  .Attr("channels: int = 0")
+  .Output("image: uint8")
+  .Doc(R"doc(
+Decodes the contents of a PNG file into a uint8 tensor.
+
+contents: PNG file contents.
+channels: Number of color channels, or 0 to autodetect based on the input.
+  Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.
+  If the input has a different number of channels, it will be transformed
+  accordingly.
+image:= A 3-D uint8 tensor of shape `[height, width, channels]`.
+  If `channels` is 0, the last dimension is determined
+  from the png contents.
+)doc");
+```
 
 Results in this piece of Markdown:
 
@@ -429,12 +429,12 @@ Results in this piece of Markdown:
 
     #### Args:
 
-    *  <b>contents</b>: A string Tensor. PNG file contents.
-    *  <b>channels</b>: An optional int. Defaults to 0.
+    *  **contents**: A string Tensor. PNG file contents.
+    *  **channels**: An optional int. Defaults to 0.
        Number of color channels, or 0 to autodetect based on the input.
        Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.  If the
        input has a different number of channels, it will be transformed accordingly.
-    *  <b>name</b>: A name for the operation (optional).
+    *  **name**: A name for the operation (optional).
 
     #### Returns:
     A 3-D uint8 tensor of shape `[height, width, channels]`.  If `channels` is
@@ -442,7 +442,7 @@ Results in this piece of Markdown:
 
 Much of the argument description is added automatically. In particular, the doc
 generator automatically adds the name and type of all inputs, attrs, and
-outputs. In the above example, `<b>contents</b>: A string Tensor.` was added
+outputs. In the above example, `contents: A string Tensor.` was added
 automatically. You should write your additional text to flow naturally after
 that description.
 
@@ -664,10 +664,10 @@ This generates the following Args section in
 
     #### Args:
 
-    * <b>`contents`</b>: A `Tensor` of type `string`. 0-D.  The PNG-encoded
+    * **`contents`**: A `Tensor` of type `string`. 0-D.  The PNG-encoded
       image.
-    * <b>`channels`</b>: An optional `int`. Defaults to `0`. Number of color
+    * **`channels`**: An optional `int`. Defaults to `0`. Number of color
       channels for the decoded image.
-    * <b>`dtype`</b>: An optional `tf.DType` from: `tf.uint8,
+    * **`dtype`**: An optional `tf.DType` from: `tf.uint8,
       tf.uint16`. Defaults to `tf.uint 8`.
-    * <b>`name`</b>: A name for the operation (optional).
+    * **`name`**: A name for the operation (optional).
diff --git a/tensorflow/docs_src/deploy/s3.md b/tensorflow/docs_src/deploy/s3.md
index 38f8428634..ef3b030e32 100644
--- a/tensorflow/docs_src/deploy/s3.md
+++ b/tensorflow/docs_src/deploy/s3.md
@@ -1,22 +1,13 @@
 # How to run TensorFlow on S3
 
-This document describes how to run TensorFlow on S3 file system.
+TensorFlow supports reading and writing data to S3. S3 is an object storage API which is nearly ubiquitous, and can help in situations where data must be accessed by multiple actors, such as in distributed training.
 
-## S3
+This document guides you through the required setup, and provides examples on usage.
 
-We assume that you are familiar with @{$reading_data$reading data}.
-
-To use S3 with TensorFlow, change the file paths you use to read and write
-data to an S3 path. For example:
-
-```python
-filenames = ["s3://bucketname/path/to/file1.tfrecord",
-             "s3://bucketname/path/to/file2.tfrecord"]
-dataset = tf.data.TFRecordDataset(filenames)
-```
+## Configuration
 
 When reading or writing data on S3 with your TensorFlow program, the behavior
-could be controlled by various environmental variables:
+can be controlled by various environmental variables:
 
 *   **AWS_REGION**: By default, regional endpoint is used for S3, with region
     controlled by `AWS_REGION`. If `AWS_REGION` is not specified, then
@@ -28,7 +19,7 @@ could be controlled by various environmental variables:
 *   **S3_VERIFY_SSL**: If HTTPS is used, SSL verification could be disabled
     with `S3_VERIFY_SSL=0`.
 
-To read or write objects in a bucket that is no publicly accessible,
+To read or write objects in a bucket that is not publicly accessible,
 AWS credentials must be provided through one of the following methods:
 
 *   Set credentials in the AWS credentials profile file on the local system,
@@ -38,3 +29,65 @@ AWS credentials must be provided through one of the following methods:
     variables.
 *   If TensorFlow is deployed on an EC2 instance, specify an IAM role and then
     give the EC2 instance access to that role.
+
+## Example Setup
+
+Using the above information, we can configure TensorFlow to communicate with an S3 endpoint by setting the following environment variables:
+
+```bash
+AWS_ACCESS_KEY_ID=XXXXX                 # Credentials only needed if connecting to a private endpoint
+AWS_SECRET_ACCESS_KEY=XXXXX
+AWS_REGION=us-east-1                    # Region for the S3 bucket, this is not always needed. Default is us-east-1.
+S3_ENDPOINT=s3.us-east-1.amazonaws.com  # The S3 API Endpoint to connect to. This is specified in a HOST:PORT format.
+S3_USE_HTTPS=1                          # Whether or not to use HTTPS. Disable with 0.
+S3_VERIFY_SSL=1                         # If HTTPS is used, controls whether SSL verification is enabled. Disable with 0.
+```
+
+## Usage
+
+Once setup is completed, Tensorflow can interact with S3 in a variety of ways. Anywhere there is a Tensorflow IO function, an S3 URL can be used.
+
+### Smoke Test
+
+To test your setup, stat a file:
+
+```python
+from tensorflow.python.lib.io import file_io
+print(file_io.stat('s3://bucketname/path/'))
+```
+
+You should see output similar to this:
+
+```console
+<tensorflow.python.pywrap_tensorflow_internal.FileStatistics; proxy of <Swig Object of type 'tensorflow::FileStatistics *' at 0x10c2171b0> >
+```
+
+### Reading Data
+
+When @{$reading_data$reading data}, change the file paths you use to read and write
+data to an S3 path. For example:
+
+```python
+filenames = ["s3://bucketname/path/to/file1.tfrecord",
+             "s3://bucketname/path/to/file2.tfrecord"]
+dataset = tf.data.TFRecordDataset(filenames)
+```
+
+### TensorFlow Tools
+
+Many TensorFlow tools, such as TensorBoard or model serving, can also take S3 URLs as arguments:
+
+```bash
+tensorboard --logdir s3://bucketname/path/to/model/
+tensorflow_model_server --port=9000 --model_name=model --model_base_path=s3://bucketname/path/to/model/export/
+```
+
+This enables an end to end workflow using S3 for all data needs.
+
+## S3 Endpoint Implementations
+
+S3 was invented by Amazon, but the S3 API has spread in popularity and has several implementations. The following implementations have passed basic compatibility tests:
+
+* [Amazon S3](https://aws.amazon.com/s3/)
+* [Google Storage](https://cloud.google.com/storage/docs/interoperability)
+* [Minio](https://www.minio.io/kubernetes.html) (Standalone mode only)
diff --git a/tensorflow/docs_src/extend/language_bindings.md b/tensorflow/docs_src/extend/language_bindings.md
index b9fd72978d..9a968d365b 100644
--- a/tensorflow/docs_src/extend/language_bindings.md
+++ b/tensorflow/docs_src/extend/language_bindings.md
@@ -112,11 +112,11 @@ There are a few ways to get a list of the `OpDef`s for the registered ops:
     to interpret the `OpDef` messages.
 -   The C++ function `OpRegistry::Global()->GetRegisteredOps()` returns the same
     list of all registered `OpDef`s (defined in
-    [`tensorflow/core/framework/op.h`]). This can be used to write the generator
+    [`tensorflow/core/framework/op.h`](https://www.tensorflow.org/code/tensorflow/core/framework/op.h)). This can be used to write the generator
     in C++ (particularly useful for languages that do not have protocol buffer
     support).
 -   The ASCII-serialized version of that list is periodically checked in to
-    [`tensorflow/core/ops/ops.pbtxt`] by an automated process.
+    [`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt) by an automated process.
 
 The `OpDef` specifies the following:
 
@@ -159,7 +159,7 @@ between the generated code and the `OpDef`s checked into the repository, but is
 useful for languages where code is expected to be generated ahead of time like
 `go get` for Go and `cargo ops` for Rust. At the other end of the spectrum, for
 some languages the code could be generated dynamically from
-[`tensorflow/core/ops/ops.pbtxt`].
+[`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt).
 
 #### Handling Constants
 
@@ -229,6 +229,3 @@ and "while") is not available in languages other than Python. This will be
 updated when the [C API] provides necessary support.
 
 [C API]: https://www.tensorflow.org/code/tensorflow/c/c_api.h
-[`tensorflow/core/ops/ops.pbtxt`]: https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt
-[`tensorflow/python/BUILD`]: https://www.tensorflow.org/code/tensorflow/python/BUILD
-[`tensorflow/core/framework/op.h`]: https://www.tensorflow.org/code/tensorflow/core/framework/op.h
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 274413e294..995b8ae666 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 1a0956634d..2938a8f7ee 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.7.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index cdde45a6f4..05604d95c5 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.7.0</version>
+  <version>1.8.0-rc0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.7.0</version>
+                 <version>1.8.0-rc0</version>
                </dependency>
              </dependencies>
          </project>
@@ -93,6 +93,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
 
               // Execute the "MyConst" operation in a Session.
               try (Session s = new Session(g);
+                   // Generally, there may be multiple output tensors; all of them must be closed to prevent resource leaks.
                    Tensor output = s.runner().fetch("MyConst").run().get(0)) {
                 System.out.println(new String(output.bytesValue(), "UTF-8"));
               }
@@ -123,12 +124,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.7.0</version>
+  <version>1.8.0-rc0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.7.0</version>
+  <version>1.8.0-rc0</version>
 </dependency>
 ```
 
@@ -147,7 +148,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +175,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.7.0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc0.zip).
   3. Extract this .zip file.
 
 
@@ -207,6 +208,7 @@ public class HelloTF {
 
       // Execute the "MyConst" operation in a Session.
       try (Session s = new Session(g);
+           // Generally, there may be multiple output tensors; all of them must be closed to prevent resource leaks.
            Tensor output = s.runner().fetch("MyConst").run().get(0)) {
         System.out.println(new String(output.bytesValue(), "UTF-8"));
       }
@@ -225,7 +227,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.7.0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.8.0-rc0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +241,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.7.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.7.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 04e4242b0f..1a349f5412 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -65,16 +65,38 @@ must be installed on your system:
     <pre>
     $ <b>sudo apt-get install libcupti-dev</b>
     </pre>
+
   * **[OPTIONAL]**  For optimized inferencing performance, you can also install
-    NVIDIA TensorRT 3.0. For details, see
-    [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html#installing-tar).
-    Only steps 1-4 in the TensorRT Tar File installation instructions are
-    required for compatibility with TensorFlow; the Python package installation
-    in steps 5 and 6 can be omitted. Detailed installation instructions can be found at [package documentataion](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#installing-tensorrt-304)
+    **NVIDIA TensorRT 3.0**. The minimal set of TensorRT runtime components needed
+    for use with the pre-built `tensorflow-gpu` package can be installed as follows:
+
+    <pre>
+    $ <b>wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</b>
+    $ <b>sudo dpkg -i nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</b>
+    $ <b>sudo apt-get update</b>
+    $ <b>sudo apt-get install -y --allow-downgrades libnvinfer-dev libcudnn7-dev=7.0.5.15-1+cuda9.0 libcudnn7=7.0.5.15-1+cuda9.0</b>
+    </pre>
 
     **IMPORTANT:** For compatibility with the pre-built `tensorflow-gpu`
-    package, please use the Ubuntu **14.04** tar file package of TensorRT
-    even when installing onto an Ubuntu 16.04 system.   
+    package, please use the Ubuntu **14.04** package of TensorRT as shown above,
+    even when installing onto an Ubuntu 16.04 system.<br/>
+    <br/>
+    To build the TensorFlow-TensorRT integration module from source rather than
+    using pre-built binaries, see the [module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#using-tensorrt-in-tensorflow).
+    For detailed TensorRT installation instructions, see [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html).<br/>
+    <br/>
+    To avoid cuDNN version conflicts during later system upgrades, you can hold
+    the cuDNN version at 7.0.5:
+
+    <pre>
+    $ <b>sudo apt-mark hold libcudnn7 libcudnn7-dev</b>
+    </pre>
+
+    To later allow upgrades, you can remove the hold:
+
+    <pre>
+    $ <b>sudo apt-mark unhold libcudnn7 libcudnn7-dev</b>
+    </pre>
 
 If you have an earlier version of the preceding packages, please upgrade to
 the specified versions. If upgrading is not possible, then you may still run
@@ -194,7 +216,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -299,7 +321,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -485,7 +507,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -659,14 +681,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -678,14 +700,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -697,14 +719,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -716,14 +738,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index b3e9616a05..a237d1af54 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -524,7 +524,7 @@ The value you specify depends on your Python version.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl
 </pre>
 
 
@@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 26287aa3a1..b186758653 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -354,10 +354,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.7.0 on Linux:
+for TensorFlow 1.8.0rc0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -454,6 +454,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
@@ -475,6 +477,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
@@ -490,6 +493,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/mobile/android_build.md b/tensorflow/docs_src/mobile/android_build.md
index 08a5fbe41c..c35530061d 100644
--- a/tensorflow/docs_src/mobile/android_build.md
+++ b/tensorflow/docs_src/mobile/android_build.md
@@ -51,7 +51,8 @@ If you haven't already, do the following two things:
         // set to 'bazel', 'cmake', 'makefile', 'none'
         def nativeBuildSystem = 'none'
 
-4. Click the Run button (the green arrow) or use **Run -> Run 'android'** from the top menu.
+4. Click the *Run* button (the green arrow) or select *Run > Run 'android'* from the
+    top menu. You may need to rebuild the project using *Build > Rebuild Project*.
 
     If it asks you to use Instant Run, click **Proceed Without Instant Run**.
 
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index 411889cb1c..2fea02d861 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -110,7 +110,7 @@ we've added a separate rewrite for the *eval graph*:
 
 ```
 # Build eval model
-logits = tf.nn.softmax_cross_entropy_with_logits(...)
+logits = tf.nn.softmax_cross_entropy_with_logits_v2(...)
 
 # Call the eval rewrite which rewrites the graph in-place with
 # FakeQuantization nodes and fold batchnorm for eval.
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index f5a0eb0a20..f7817b06d4 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -400,7 +400,7 @@ diff = -(y_ * tf.log(y))
 to the built-in, numerically-stable implementation of softmax cross-entropy:
 
 ```python
-diff = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=logits)
+diff = tf.losses.softmax_cross_entropy(labels=y_, logits=logits)
 ```
 
 Rerun with the `--debug` flag as follows:
diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index aa72cae766..f0dd8def17 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -210,7 +210,7 @@ with tf.device("/device:GPU:0"):
   # Operations created in this context will be pinned to the GPU.
   result = tf.matmul(weights, img)
 ```
-If you are deploying TensorFlow in a @{$deploy/distributed$typical distributed configuration},
+If you are deploying TensorFlow in a @{$distributed$typical distributed configuration},
 you might specify the job name and task ID to place variables on
 a task in the parameter server job (`"/job:ps"`), and the other operations on
 task in the worker job (`"/job:worker"`):
@@ -362,7 +362,7 @@ operations that are needed to compute the result.
 
 @{tf.Session.run} requires you to specify a list of **fetches**, which determine
 the return values, and may be a @{tf.Operation}, a @{tf.Tensor}, or
-a [tensor-like type](#tensor-like-objects) such as @{tf.Variable}. These fetches
+a [tensor-like type](#tensor-like_objects) such as @{tf.Variable}. These fetches
 determine what **subgraph** of the overall @{tf.Graph} must be executed to
 produce the result: this is the subgraph that contains all operations named in
 the fetch list, plus all operations whose outputs are used to compute the value
@@ -505,7 +505,7 @@ multiple graphs in the same process.
 As noted above, TensorFlow provides a "default graph" that is implicitly passed
 to all API functions in the same context. For many applications, a single graph
 is sufficient. However, TensorFlow also provides methods for manipulating
-the default graph, which can be useful in more advanced used cases. For example:
+the default graph, which can be useful in more advanced use cases. For example:
 
 * A @{tf.Graph} defines the namespace for @{tf.Operation} objects: each
   operation in a single graph must have a unique name. TensorFlow will
diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index 55ee42dd64..c6ef87c54a 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -485,31 +485,7 @@ portion of the signature.  That is, when writing a
 to expect and how to map them to your model's expected inputs.
 By contrast, the *output* portion of the signature is determined by the model.
 
-
-### Perform the export
-
-To export your trained Estimator, call
-@{tf.estimator.Estimator.export_savedmodel} with the export base path and
-the `serving_input_receiver_fn`.
-
-```py
-estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn,
-                            strip_default_attrs=True)
-```
-
-This method builds a new graph by first calling the
-`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling
-this `Estimator`'s `model_fn()` to generate the model graph based on those
-features. It starts a fresh `Session`, and, by default, restores the most recent
-checkpoint into it.  (A different checkpoint may be passed, if needed.)
-Finally it creates a time-stamped export directory below the given
-`export_dir_base` (i.e., `export_dir_base/<timestamp>`), and writes a
-SavedModel into it containing a single `MetaGraphDef` saved from this
-Session.
-
-> Note: It is your responsibility to garbage-collect old exports.
-> Otherwise, successive exports will accumulate under `export_dir_base`.
-
+<a name="specify_outputs"></a>
 ### Specify the outputs of a custom model
 
 When writing a custom `model_fn`, you must populate the `export_outputs` element
@@ -541,6 +517,30 @@ using [`signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`](https://www.tens
 indicating which `SignatureDef` will be served when an inference request
 does not specify one.
 
+<a name="perform_export"></a>
+### Perform the export
+
+To export your trained Estimator, call
+@{tf.estimator.Estimator.export_savedmodel} with the export base path and
+the `serving_input_receiver_fn`.
+
+```py
+estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn,
+                            strip_default_attrs=True)
+```
+
+This method builds a new graph by first calling the
+`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling
+this `Estimator`'s `model_fn()` to generate the model graph based on those
+features. It starts a fresh `Session`, and, by default, restores the most recent
+checkpoint into it.  (A different checkpoint may be passed, if needed.)
+Finally it creates a time-stamped export directory below the given
+`export_dir_base` (i.e., `export_dir_base/<timestamp>`), and writes a
+SavedModel into it containing a single `MetaGraphDef` saved from this
+Session.
+
+> Note: It is your responsibility to garbage-collect old exports.
+> Otherwise, successive exports will accumulate under `export_dir_base`.
 
 ### Serve the exported model locally
 
diff --git a/tensorflow/docs_src/programmers_guide/using_tpu.md b/tensorflow/docs_src/programmers_guide/using_tpu.md
index cb0d86fc4c..5e3e49d434 100644
--- a/tensorflow/docs_src/programmers_guide/using_tpu.md
+++ b/tensorflow/docs_src/programmers_guide/using_tpu.md
@@ -280,8 +280,8 @@ Where `params['batch-size']` will contain the batch size.
 ### Static shapes and batch size
 
 The input pipeline generated by your `input_fn` is run on CPU. So it is mostly
-free strict static shape requirements imposed by the XLA/TPU environment. The
-one requirement is that the batches of data fed from your input pipeline to
+free from the strict static shape requirements imposed by the XLA/TPU environment.
+The one requirement is that the batches of data fed from your input pipeline to
 the TPU have a static shape, as determined by the standard TensorFlow shape
 inference algorithm. Intermediate tensors are free to have a dynamic shapes.
 If shape inference has failed, but the shape is known it is possible to
diff --git a/tensorflow/docs_src/tutorials/audio_recognition.md b/tensorflow/docs_src/tutorials/audio_recognition.md
index 7d79f433c4..372ab47df7 100644
--- a/tensorflow/docs_src/tutorials/audio_recognition.md
+++ b/tensorflow/docs_src/tutorials/audio_recognition.md
@@ -280,7 +280,7 @@ tool:
 ```
 bazel run tensorflow/examples/wav_to_spectrogram:wav_to_spectrogram -- \
 --input_wav=/tmp/speech_dataset/happy/ab00c4b2_nohash_0.wav \
---output_png=/tmp/spectrogram.png
+--output_image=/tmp/spectrogram.png
 ```
 
 If you open up `/tmp/spectrogram.png` you should see something like this:
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index cadaec391d..37cd2bb139 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -192,8 +192,7 @@ dive deeper into the `tf.layers` code used to create each layer, as well as how
 to calculate loss, configure the training op, and generate predictions. If
 you're already experienced with CNNs and @{$get_started/custom_estimators$TensorFlow `Estimator`s},
 and find the above code intuitive, you may want to skim these sections or just
-skip ahead to ["Training and Evaluating the CNN MNIST
-Classifier"](#training_and_evaluating_the_cnn_mnist_classifier).
+skip ahead to ["Training and Evaluating the CNN MNIST Classifier"](#train_eval_mnist).
 
 ### Input Layer
 
@@ -536,8 +535,9 @@ if mode == tf.estimator.ModeKeys.TRAIN:
 ```
 
 > Note: For a more in-depth look at configuring training ops for Estimator model
-> functions, see @{$get_started/custom_estimators#defining_the_training_op_for_the_model$"Defining the training op for the model"} 
-> in the @{$get_started/custom_estimators$"Creating Estimators in tf.estimator."} tutorial.
+> functions, see @{$get_started/custom_estimators#defining-the-training-op-for-the-model$"Defining the training op for the model"}
+> in the @{$get_started/custom_estimators$"Creating Estimators in tf.estimator"} tutorial.
+
 
 ### Add evaluation metrics
 
@@ -552,7 +552,8 @@ return tf.estimator.EstimatorSpec(
     mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
 ```
 
-## Training and Evaluating the CNN MNIST Classifier {#training_and_evaluating_the_cnn_mnist_classifier}
+<a id="train_eval_mnist"></a>
+## Training and Evaluating the CNN MNIST Classifier
 
 We've coded our MNIST CNN model function; now we're ready to train and evaluate
 it.
@@ -612,9 +613,9 @@ following to `main()`:
 
 ```python
 # Set up logging for predictions
-  tensors_to_log = {"probabilities": "softmax_tensor"}
-  logging_hook = tf.train.LoggingTensorHook(
-      tensors=tensors_to_log, every_n_iter=50)
+tensors_to_log = {"probabilities": "softmax_tensor"}
+logging_hook = tf.train.LoggingTensorHook(
+    tensors=tensors_to_log, every_n_iter=50)
 ```
 
 We store a dict of the tensors we want to log in `tensors_to_log`. Each key is a
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 14ae7fbf35..b09ee99768 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -224,7 +224,7 @@ with graph.as_default():
     optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
 
   # Compute the cosine similarity between minibatch examples and all embeddings.
-  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
+  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
   normalized_embeddings = embeddings / norm
   valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                             valid_dataset)
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index ec7d9dcc4f..c31ca8b67a 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -21159,7 +21159,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
 //    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
 //
 // The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
 // Each element must be in the range `[0, 255]` (It represents the value of a
 // pixel in the output image).  Non-finite values in the input tensor are
 // replaced by this tensor in the output image.  The default value is the color
diff --git a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
index 489e95c310..3948991c84 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
@@ -101,6 +101,7 @@ public class LabelImage {
                   b.constant("mean", mean)),
               b.constant("scale", scale));
       try (Session s = new Session(g)) {
+        // Generally, there may be multiple output tensors; all of them must be closed to prevent resource leaks.
         return s.runner().fetch(output.op().name()).run().get(0).expect(Float.class);
       }
     }
@@ -110,6 +111,7 @@ public class LabelImage {
     try (Graph g = new Graph()) {
       g.importGraphDef(graphDef);
       try (Session s = new Session(g);
+          // Generally, there may be multiple output tensors; all of them must be closed to prevent resource leaks.
           Tensor<Float> result =
               s.runner().feed("input", image).fetch("output").run().get(0).expect(Float.class)) {
         final long[] rshape = result.shape();
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 9dc03d7cdb..8e7f0cadad 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1946,7 +1946,8 @@ py_library(
         ":array_ops",
         ":constant_op",
         ":dtypes",
-        ":linalg_ops",
+        ":linalg_ops_gen",
+        ":linalg_ops_impl",
         ":math_ops",
         ":nn_ops",
         ":random_ops",
@@ -1997,7 +1998,22 @@ py_library(
         ":array_ops",
         ":dtypes",
         ":framework_ops",
+        ":functional_ops",
         ":linalg_ops_gen",
+        ":linalg_ops_impl",
+        ":math_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "linalg_ops_impl",
+    srcs = ["ops/linalg_ops_impl.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":dtypes",
+        ":framework_ops",
         ":math_ops",
         "//third_party/py/numpy",
     ],
@@ -3493,6 +3509,7 @@ tf_py_wrap_cc(
         "//tensorflow/core/profiler/internal:print_model_analysis",
         "//tensorflow/tools/graph_transforms:transform_graph_lib",
         "//tensorflow/python/eager:pywrap_tfe_lib",
+        "//tensorflow/python/eager:python_eager_op_gen",
         "//util/python:python_headers",
     ] + (tf_additional_lib_deps() +
          tf_additional_plugin_deps() +
diff --git a/tensorflow/python/debug/cli/readline_ui.py b/tensorflow/python/debug/cli/readline_ui.py
index 151638789f..3296e45d07 100644
--- a/tensorflow/python/debug/cli/readline_ui.py
+++ b/tensorflow/python/debug/cli/readline_ui.py
@@ -19,6 +19,8 @@ from __future__ import print_function
 
 import readline
 
+import six
+
 from tensorflow.python.debug.cli import base_ui
 from tensorflow.python.debug.cli import debugger_cli_common
 
@@ -39,11 +41,7 @@ class ReadlineUI(base_ui.BaseUI):
     readline.set_completer(self._readline_complete)
     readline.parse_and_bind("tab: complete")
 
-    # For Python 2-3 compatibility.
-    try:
-      self._input = raw_input
-    except NameError:
-      self._input = input
+    self._input = six.moves.input
 
   def _readline_complete(self, text, state):
     context, prefix, except_last_word = self._analyze_tab_complete_input(text)
diff --git a/tensorflow/python/debug/wrappers/grpc_wrapper.py b/tensorflow/python/debug/wrappers/grpc_wrapper.py
index fb9494f576..1f9c8fa5a9 100644
--- a/tensorflow/python/debug/wrappers/grpc_wrapper.py
+++ b/tensorflow/python/debug/wrappers/grpc_wrapper.py
@@ -21,6 +21,8 @@ import signal
 import sys
 import traceback
 
+import six
+
 # Google-internal import(s).
 from tensorflow.python.debug.lib import common
 from tensorflow.python.debug.wrappers import framework
@@ -140,14 +142,9 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
 
 
 def _signal_handler(unused_signal, unused_frame):
-  try:
-    input_func = raw_input
-  except NameError:
-    # Python 3 does not have raw_input.
-    input_func = input
-
   while True:
-    response = input_func("\nSIGINT received. Quit program? (Y/n): ").strip()
+    response = six.moves.input(
+        "\nSIGINT received. Quit program? (Y/n): ").strip()
     if response in ("", "Y", "y"):
       sys.exit(0)
     elif response in ("N", "n"):
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index 6705cd31e2..5e4604fda4 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -31,15 +31,18 @@ from tensorflow.python.training import session_run_hook
 class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   """Command-line-interface debugger hook.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s. Provides a substitute for
+  `tfdbg.LocalCLIDebugWrapperSession` in cases where the session is not directly
+  available.
   """
 
   def __init__(self, ui_type="curses", dump_root=None, thread_name_filter=None):
     """Create a local debugger command-line interface (CLI) hook.
 
     Args:
-      ui_type: (str) user-interface type.
+      ui_type: (`str`) requested user-interface type. Currently supported:
+        (curses | readline).
       dump_root: (`str`) optional path to the dump root directory. Must be a
         directory that does not exist or an empty directory. If the directory
         does not exist, it will be created by the debugger core during debug
@@ -153,8 +156,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
 class DumpingDebugHook(session_run_hook.SessionRunHook):
   """A debugger hook that dumps debug data to filesystem.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s.
   """
 
   def __init__(self,
@@ -229,8 +232,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
   When the arguments of debug_utils.watch_graph changes, strongly consider
   changing arguments here too so that features are available to tflearn users.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s.
   """
 
   def __init__(self,
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index c365ea8b4a..efa4bdf598 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -263,9 +263,12 @@ def _check_dense_labels_match_logits_and_reshape(
         if (dim1 is not None) and (dim1 != expected_labels_dimension):
           raise ValueError(
               'Mismatched label shape. '
-              'Classifier configured with n_classes=%s.  Received %s. '
-              'Suggested Fix: check your n_classes argument to the estimator '
-              'and/or the shape of your label.' %
+              'Expected labels dimension=%s.  Received %s. '
+              'Suggested Fix: '
+              'If your classifier expects one-hot encoding label, '
+              'check your n_classes argument to the estimator '
+              'and/or the shape of your label. '
+              'Otherwise, check the shape of your label.' %
               (expected_labels_dimension, dim1))
       expected_labels_shape = array_ops.concat(
           [logits_shape[:-1], [expected_labels_dimension]], axis=0)
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 351fcb6423..2f1212d5a2 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -207,7 +207,8 @@ class Estimator(object):
     else:
       self._session_config = self._config.session_config
 
-    self._device_fn = _get_replica_device_setter(self._config)
+    self._device_fn = self._config.device_fn or \
+                      _get_replica_device_setter(self._config)
 
     if model_fn is None:
       raise ValueError('model_fn must be provided to Estimator.')
@@ -716,7 +717,7 @@ class Estimator(object):
       batch_length = batch_length or value.shape[0]
       if value.shape[0] != batch_length:
         raise ValueError('Batch length of predictions should be same. %s has '
-                         'different batch length then others.' % key)
+                         'different batch length than others.' % key)
     return batch_length
 
   def _extract_keys(self, predictions, predict_keys):
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index dab442aeda..8162b249f1 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -27,11 +27,13 @@ import six
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
+from tensorflow.python.estimator import util
 from tensorflow.python.util import compat_internal
 from tensorflow.python.util.tf_export import tf_export
 
 
 _USE_DEFAULT = object()
+_VALID_DEVICE_FN_ARGS = set(['op'])
 
 # A list of the property names in RunConfig that the user is allowed to change.
 _DEFAULT_REPLACEABLE_LIST = [
@@ -44,7 +46,8 @@ _DEFAULT_REPLACEABLE_LIST = [
     'keep_checkpoint_max',
     'keep_checkpoint_every_n_hours',
     'log_step_count_steps',
-    'train_distribute'
+    'train_distribute',
+    'device_fn'
 ]
 
 _SAVE_CKPT_ERR = (
@@ -279,6 +282,11 @@ def _validate_properties(run_config):
   _validate('tf_random_seed', lambda seed: isinstance(seed, six.integer_types),
             message='tf_random_seed must be integer.')
 
+  _validate('device_fn', lambda device_fn: six.callable(device_fn) and
+            set(util.fn_args(device_fn)) == _VALID_DEVICE_FN_ARGS,
+            message='device_fn must be callable with exactly'
+                    ' one argument "op".')
+
 
 class TaskType(object):
   MASTER = 'master'
@@ -302,7 +310,8 @@ class RunConfig(object):
                keep_checkpoint_max=5,
                keep_checkpoint_every_n_hours=10000,
                log_step_count_steps=100,
-               train_distribute=None):
+               train_distribute=None,
+               device_fn=None):
     """Constructs a RunConfig.
 
     All distributed training related properties `cluster_spec`, `is_chief`,
@@ -430,6 +439,10 @@ class RunConfig(object):
         `tf.contrib.distribute.DistributionStrategy`. If specified,
         then Estimator will distribute the user's model during training,
         according to the policy specified by that strategy.
+      device_fn: A callable invoked for every `Operation` that takes the
+        `Operation` and returns the device string. If `None`, defaults to
+        the device function returned by `tf.train.replica_device_setter`
+        with round-robin strategy.
 
     Raises:
       ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
@@ -466,7 +479,8 @@ class RunConfig(object):
         keep_checkpoint_max=keep_checkpoint_max,
         keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
         log_step_count_steps=log_step_count_steps,
-        train_distribute=train_distribute)
+        train_distribute=train_distribute,
+        device_fn=device_fn)
 
     self._init_distributed_setting_from_environment_var(tf_config)
 
@@ -568,6 +582,16 @@ class RunConfig(object):
   def cluster_spec(self):
     return self._cluster_spec
 
+  @property
+  def device_fn(self):
+    """Returns the device_fn.
+
+    If device_fn is not `None`, it overrides the default
+    device function used in `Estimator`.
+    Otherwise the default one is used.
+    """
+    return self._device_fn
+
   @property
   def evaluation_master(self):
     return self._evaluation_master
@@ -697,7 +721,8 @@ class RunConfig(object):
       - `keep_checkpoint_max`,
       - `keep_checkpoint_every_n_hours`,
       - `log_step_count_steps`,
-      - `train_distribute`.
+      - `train_distribute`,
+      - `device_fn`.
 
     In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
     can be set (should not be both).
diff --git a/tensorflow/python/estimator/run_config_test.py b/tensorflow/python/estimator/run_config_test.py
index a3eef4c53f..c8b12605e1 100644
--- a/tensorflow/python/estimator/run_config_test.py
+++ b/tensorflow/python/estimator/run_config_test.py
@@ -42,6 +42,7 @@ _SESSION_CONFIG_ERR = 'session_config must be instance of ConfigProto'
 _KEEP_CKPT_MAX_ERR = 'keep_checkpoint_max should be >= 0'
 _KEEP_CKPT_HOURS_ERR = 'keep_checkpoint_every_n_hours should be > 0'
 _TF_RANDOM_SEED_ERR = 'tf_random_seed must be integer'
+_DEVICE_FN_ERR = 'device_fn must be callable with exactly one argument "op".'
 _ONE_CHIEF_ERR = 'The "cluster" in TF_CONFIG must have only one "chief" node.'
 _ONE_MASTER_ERR = 'The "cluster" in TF_CONFIG must have only one "master" node.'
 _INVALID_TASK_TYPE_FOR_EVAL_MASTER = (
@@ -83,6 +84,7 @@ class RunConfigTest(test.TestCase):
     self.assertEqual(5, config.keep_checkpoint_max)
     self.assertEqual(10000, config.keep_checkpoint_every_n_hours)
     self.assertIsNone(config.service)
+    self.assertIsNone(config.device_fn)
 
   def test_model_dir(self):
     empty_config = run_config_lib.RunConfig()
@@ -93,6 +95,7 @@ class RunConfigTest(test.TestCase):
 
   def test_replace_with_allowed_properties(self):
     session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    device_fn = lambda op: "/cpu:0"
 
     config = run_config_lib.RunConfig().replace(
         tf_random_seed=11,
@@ -100,13 +103,15 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_secs=14,
         session_config=session_config,
         keep_checkpoint_max=16,
-        keep_checkpoint_every_n_hours=17)
+        keep_checkpoint_every_n_hours=17,
+        device_fn=device_fn)
     self.assertEqual(11, config.tf_random_seed)
     self.assertEqual(12, config.save_summary_steps)
     self.assertEqual(14, config.save_checkpoints_secs)
     self.assertEqual(session_config, config.session_config)
     self.assertEqual(16, config.keep_checkpoint_max)
     self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+    self.assertEqual(device_fn, config.device_fn)
 
   def test_replace_none_value(self):
     config = run_config_lib.RunConfig().replace(
@@ -117,7 +122,8 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_steps=None,
         session_config=None,
         keep_checkpoint_max=None,
-        keep_checkpoint_every_n_hours=None)
+        keep_checkpoint_every_n_hours=None,
+        device_fn=None)
     self.assertIsNone(config.tf_random_seed)
     self.assertIsNone(config.model_dir)
     self.assertIsNone(config.save_summary_steps)
@@ -126,6 +132,7 @@ class RunConfigTest(test.TestCase):
     self.assertIsNone(config.session_config)
     self.assertIsNone(config.keep_checkpoint_max)
     self.assertIsNone(config.keep_checkpoint_every_n_hours)
+    self.assertIsNone(config.device_fn)
 
   def test_replace_with_disallowallowed_properties(self):
     config = run_config_lib.RunConfig()
@@ -166,9 +173,12 @@ class RunConfigTest(test.TestCase):
       config.replace(keep_checkpoint_every_n_hours=0)
     with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
       config.replace(tf_random_seed=1.0)
+    with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR):
+      config.replace(device_fn=lambda x, y: 0)
 
   def test_init_with_allowed_properties(self):
     session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    device_fn = lambda op: "/cpu:0"
 
     config = run_config_lib.RunConfig(
         tf_random_seed=11,
@@ -176,13 +186,15 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_secs=14,
         session_config=session_config,
         keep_checkpoint_max=16,
-        keep_checkpoint_every_n_hours=17)
+        keep_checkpoint_every_n_hours=17,
+        device_fn=device_fn)
     self.assertEqual(11, config.tf_random_seed)
     self.assertEqual(12, config.save_summary_steps)
     self.assertEqual(14, config.save_checkpoints_secs)
     self.assertEqual(session_config, config.session_config)
     self.assertEqual(16, config.keep_checkpoint_max)
     self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+    self.assertEqual(device_fn, config.device_fn)
 
   def test_init_none_value(self):
     config = run_config_lib.RunConfig(
@@ -193,7 +205,8 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_steps=None,
         session_config=None,
         keep_checkpoint_max=None,
-        keep_checkpoint_every_n_hours=None)
+        keep_checkpoint_every_n_hours=None,
+        device_fn=None)
     self.assertIsNone(config.tf_random_seed)
     self.assertIsNone(config.model_dir)
     self.assertIsNone(config.save_summary_steps)
@@ -202,6 +215,7 @@ class RunConfigTest(test.TestCase):
     self.assertIsNone(config.session_config)
     self.assertIsNone(config.keep_checkpoint_max)
     self.assertIsNone(config.keep_checkpoint_every_n_hours)
+    self.assertIsNone(config.device_fn)
 
   def test_init_invalid_values(self):
     with self.assertRaisesRegexp(ValueError, _MODEL_DIR_ERR):
@@ -220,6 +234,8 @@ class RunConfigTest(test.TestCase):
       run_config_lib.RunConfig(keep_checkpoint_every_n_hours=0)
     with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
       run_config_lib.RunConfig(tf_random_seed=1.0)
+    with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR):
+      run_config_lib.RunConfig(device_fn=lambda x: "/cpu:0")
 
 
 class RunConfigDistributedSettingTest(test.TestCase):
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index a7c4eabcb2..c16c3cda48 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -162,7 +162,6 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
-from tensorflow.python.util.tf_export import tf_export
 
 
 def _internal_input_layer(features,
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 807582bd7e..7f9ef53457 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -700,11 +700,13 @@ def as_dtype(type_value):
     if type_value.type == np.string_ or type_value.type == np.unicode_:
       return string
 
-  for key, val in _NP_TO_TF:
-    try:
-      if key == type_value:
-        return val
-    except TypeError as e:
-      raise TypeError("Cannot convert {} to a dtype. {}".format(type_value, e))
+  if isinstance(type_value, (type, np.dtype)):
+    for key, val in _NP_TO_TF:
+      try:
+        if key == type_value:
+          return val
+      except TypeError as e:
+        raise TypeError("Cannot convert {} to a dtype. {}".format(
+            type_value, e))
 
   raise TypeError("Cannot convert value %r to a TensorFlow DType." % type_value)
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 910364364c..394fac6c85 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -285,7 +285,7 @@ def convert_variables_to_constants(sess,
     output_graph_def.node.extend([output_node])
 
   output_graph_def.library.CopyFrom(inference_graph.library)
-  print("Converted %d variables to const ops." % how_many_converted)
+  logging.info("Converted %d variables to const ops.", how_many_converted)
   return output_graph_def
 
 
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index b618152b02..2dafb94ba7 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -209,7 +209,7 @@ class DeviceFunctionsTest(test.TestCase):
           defun_node, 2.0, name="output_node")
 
       with session.Session() as sess:
-        init = variables.initialize_variables([variable_node])
+        init = variables.variables_initializer([variable_node])
         sess.run(init)
         output = sess.run(output_node)
         self.assertNear(4.0, output, 0.00001)
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 535c6017f5..9a8477debb 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -58,7 +58,7 @@ def load_op_library(library_filename):
   op_list_str = py_tf.TF_GetOpList(lib_handle)
   op_list = op_def_pb2.OpList()
   op_list.ParseFromString(compat.as_bytes(op_list_str))
-  wrappers = py_tf.GetPythonWrappers(op_list_str)
+  wrappers = py_tf.GetEagerPythonWrappers(op_list_str)
 
   # Delete the library handle to release any memory held in C
   # that are no longer needed.
diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i
index 26ec4e8e66..efcce2f209 100644
--- a/tensorflow/python/framework/python_op_gen.i
+++ b/tensorflow/python/framework/python_op_gen.i
@@ -16,10 +16,10 @@ limitations under the License.
 %include "tensorflow/python/platform/base.i"
 
 %{
-#include "tensorflow/python/framework/python_op_gen.h"
+#include "tensorflow/python/eager/python_eager_op_gen.h"
 %}
 
-// Input typemap for GetPythonWrappers.
+// Input typemap for GetEagerPythonWrappers.
 // Accepts a python object of 'bytes' type, and converts it to
 // a const char* pointer and size_t length. The default typemap
 // going from python bytes to const char* tries to decode the
@@ -37,5 +37,5 @@ limitations under the License.
 
 
 %ignoreall;
-%unignore tensorflow::GetPythonWrappers;
-%include "tensorflow/python/framework/python_op_gen.h"
+%unignore tensorflow::GetEagerPythonWrappers;
+%include "tensorflow/python/eager/python_eager_op_gen.h"
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index f954b9d6c7..5a8bc43727 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1014,6 +1014,8 @@ class TensorFlowTestCase(googletest.TestCase):
       config.graph_options.optimizer_options.opt_level = -1
       config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
+      config.graph_options.rewrite_options.arithmetic_optimization = (
+          rewriter_config_pb2.RewriterConfig.OFF)
       return config
 
     if graph is None:
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 5a84b16a23..e3dd4b0bdf 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -476,7 +476,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[1, 2], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[1, 2], keepdims=True)
       squeeze = array_ops.squeeze(reduce_sum, axis=[1, 2])
       output = array_ops.identity(squeeze)
 
@@ -506,7 +506,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[0, 1, 2], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[0, 1, 2], keepdims=True)
       squeeze = array_ops.squeeze(reduce_sum, axis=[0, 1, 2])
       output = array_ops.identity(squeeze)
 
@@ -623,7 +623,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[3], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[3], keepdims=True)
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
@@ -653,7 +653,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[2], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[2], keepdims=True)
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
@@ -682,7 +682,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[2, 3], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[2, 3], keepdims=True)
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 81a4d2f820..449410fe08 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -3448,7 +3448,7 @@ def categorical_crossentropy(target, output, from_logits=False):
   Returns:
       Output tensor.
   """
-  # Note: nn.softmax_cross_entropy_with_logits
+  # Note: nn.softmax_cross_entropy_with_logits_v2
   # expects logits, Keras expects probabilities.
   if not from_logits:
     # scale preds so that the class probas of each sample sum to 1
@@ -3512,7 +3512,7 @@ def binary_crossentropy(target, output, from_logits=False):
   Returns:
       A tensor.
   """
-  # Note: nn.softmax_cross_entropy_with_logits
+  # Note: nn.sigmoid_cross_entropy_with_logits
   # expects logits, Keras expects probabilities.
   if not from_logits:
     # transform back to logits
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index 5462a95d7d..c16fc07fb4 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -593,9 +593,9 @@ class BatchNormalization(Layer):
         # used during evaluation, it is more efficient to just update in one
         # step and should not make a significant difference in the result.
         new_mean = math_ops.reduce_mean(new_mean,
-                                        axis=1, keep_dims=True)
+                                        axis=1, keepdims=True)
         new_variance = math_ops.reduce_mean(new_variance,
-                                            axis=1, keep_dims=True)
+                                            axis=1, keepdims=True)
 
       def _do_update(var, value):
         if in_eager_mode and not self.trainable:
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index ebbec39cf3..c03c514699 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -917,6 +917,20 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "string_strip_op_test",
+    size = "small",
+    srcs = ["string_strip_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
 tf_py_test(
     name = "substr_op_test",
     size = "small",
@@ -1195,6 +1209,18 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "broadcast_to_ops_test",
+    size = "small",
+    srcs = ["broadcast_to_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 cuda_py_test(
     name = "inplace_ops_test",
     size = "small",
diff --git a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
new file mode 100644
index 0000000000..6a1bd958ba
--- /dev/null
+++ b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
@@ -0,0 +1,85 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for broadcast_to ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test as test_lib
+
+
+class BroadcastToTest(test_util.TensorFlowTestCase):
+
+  def testBroadcastToBasic(self):
+    for dtype in [np.uint8, np.uint16, np.int8, np.int16, np.int32, np.int64]:
+      with self.test_session(use_gpu=True):
+        x = np.array([1, 2, 3], dtype=dtype)
+        v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+        v_np = np.broadcast_to(x, [3, 3])
+        self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToString(self):
+    with self.test_session(use_gpu=True):
+      x = np.array([b"1", b"2", b"3"])
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToBool(self):
+    with self.test_session(use_gpu=True):
+      x = np.array([True, False, True], dtype=np.bool)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToShape(self):
+    for input_dim in range(1, 6):
+      for output_dim in range(input_dim, 6):
+        with self.test_session(use_gpu=True):
+          input_shape = [2] * input_dim
+          output_shape = [2] * output_dim
+          x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+          v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+          v_np = np.broadcast_to(x, output_shape)
+          self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToScalar(self):
+    with self.test_session(use_gpu=True):
+      x = np.array(1, dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToShapeTypeAndInference(self):
+    for dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = np.array([1, 2, 3])
+        v_tf = array_ops.broadcast_to(
+            constant_op.constant(x),
+            constant_op.constant([3, 3], dtype=dtype))
+        shape = v_tf.get_shape().as_list()
+        v_np = np.broadcast_to(x, [3, 3])
+        self.assertAllEqual(v_tf.eval(), v_np)
+        # check shape inference when shape input is constant
+        self.assertAllEqual(shape, v_np.shape)
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index 670a625f0f..79e419867d 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -104,11 +105,7 @@ class ConfusionMatrixTest(test.TestCase):
       d, l, cm_out = sess.run([data, lab, cm], {m_neg: 0.0, m_pos: 1.0, s: 1.0})
 
       truth = np.zeros([2, 2], dtype=np_dtype)
-      try:
-        range_builder = xrange
-      except NameError:  # In Python 3.
-        range_builder = range
-      for i in range_builder(len(d)):
+      for i in xrange(len(d)):
         truth[l[i], d[i]] += 1
 
       self.assertEqual(cm_out.dtype, np_dtype)
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 749313b00d..107ee37fab 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -65,6 +65,11 @@ class ConstantTest(test.TestCase):
     self._testCpu(x)
     self._testGpu(x)
 
+  def testInvalidDType(self):
+    # Test case for GitHub issue 18474
+    with self.assertRaises(TypeError):
+      constant_op.constant(dtypes_lib.string, "[,]")
+
   def testBFloat16(self):
     bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(bfloat16))
diff --git a/tensorflow/python/kernel_tests/conv3d_transpose_test.py b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
index a8b3af5096..8973a450fa 100644
--- a/tensorflow/python/kernel_tests/conv3d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
@@ -119,6 +119,18 @@ class Conv3DTransposeTest(test.TestCase):
                   target = 3.0
                 self.assertAllClose(target, value[n, d, h, w, k])
 
+  def testConv3DTransposeShapeMismatch(self):
+    # Test case for GitHub issue 18460
+    x_shape = [2, 2, 3, 4, 3]
+    f_shape = [3, 3, 3, 2, 2]
+    y_shape = [2, 2, 6, 8, 6]
+    strides = [1, 1, 2, 2, 2]
+    np.random.seed(1)
+    x_value = np.random.random_sample(x_shape).astype(np.float64)
+    f_value = np.random.random_sample(f_shape).astype(np.float64)
+    nn_ops.conv3d_transpose(
+        x_value, f_value, y_shape, strides, data_format='NCDHW')
+
   def testConv3DTransposeValid(self):
     with self.test_session():
       strides = [1, 2, 2, 2, 1]
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index b8200ac0cb..f31426713c 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import manip_ops
 from tensorflow.python.platform import test as test_lib
@@ -88,41 +90,78 @@ class RollTest(test_util.TensorFlowTestCase):
         x = np.random.rand(3, 2, 1, 1).astype(t)
         self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
 
+  def testNegativeAxis(self):
+    self._testAll(np.random.randint(-100, 100, (5)).astype(np.int32), 3, -1)
+    self._testAll(np.random.randint(-100, 100, (4, 4)).astype(np.int32), 3, -2)
+    # Make sure a negative axis satisfies 0 <= axis + dims < dims
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "is out of range"):
+        manip_ops.roll(np.random.randint(-100, 100, (4, 4)).astype(np.int32),
+                       3, -10).eval()
+
+  def testInvalidInputShape(self):
+    # The input should be 1-D or higher, checked in shape function.
+    with self.assertRaisesRegexp(
+        ValueError, "Shape must be at least rank 1 but is rank 0"):
+      manip_ops.roll(7, 1, 0)
+
   def testRollInputMustVectorHigherRaises(self):
-    tensor = 7
+    # The input should be 1-D or higher, checked in kernel.
+    tensor = array_ops.placeholder(dtype=dtypes.int32)
     shift = 1
     axis = 0
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "input must be 1-D or higher"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={tensor: 7})
+
+  def testInvalidAxisShape(self):
+    # The axis should be a scalar or 1-D, checked in shape function.
+    with self.assertRaisesRegexp(
+        ValueError, "Shape must be at most rank 1 but is rank 2"):
+      manip_ops.roll([[1, 2], [3, 4]], 1, [[0, 1]])
 
   def testRollAxisMustBeScalarOrVectorRaises(self):
+    # The axis should be a scalar or 1-D, checked in kernel.
     tensor = [[1, 2], [3, 4]]
     shift = 1
-    axis = [[0, 1]]
+    axis = array_ops.placeholder(dtype=dtypes.int32)
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "axis must be a scalar or a 1-D vector"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={axis: [[0, 1]]})
+
+  def testInvalidShiftShape(self):
+    # The shift should be a scalar or 1-D, checked in shape function.
+    with self.assertRaisesRegexp(
+        ValueError, "Shape must be at most rank 1 but is rank 2"):
+      manip_ops.roll([[1, 2], [3, 4]], [[0, 1]], 1)
 
   def testRollShiftMustBeScalarOrVectorRaises(self):
+    # The shift should be a scalar or 1-D, checked in kernel.
     tensor = [[1, 2], [3, 4]]
-    shift = [[0, 1]]
+    shift = array_ops.placeholder(dtype=dtypes.int32)
     axis = 1
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "shift must be a scalar or a 1-D vector"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={shift: [[0, 1]]})
+
+  def testInvalidShiftAndAxisNotEqualShape(self):
+    # The shift and axis must be same size, checked in shape function.
+    with self.assertRaisesRegexp(ValueError, "both shapes must be equal"):
+      manip_ops.roll([[1, 2], [3, 4]], [1], [0, 1])
 
   def testRollShiftAndAxisMustBeSameSizeRaises(self):
+    # The shift and axis must be same size, checked in kernel.
     tensor = [[1, 2], [3, 4]]
-    shift = [1]
+    shift = array_ops.placeholder(dtype=dtypes.int32)
     axis = [0, 1]
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "shift and axis must have the same size"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={shift: [1]})
 
   def testRollAxisOutOfRangeRaises(self):
     tensor = [1, 2]
diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py
index d85512fae6..3f71b326a2 100644
--- a/tensorflow/python/kernel_tests/norm_op_test.py
+++ b/tensorflow/python/kernel_tests/norm_op_test.py
@@ -37,17 +37,17 @@ class NormOpTest(test_lib.TestCase):
 
   def testBadOrder(self):
     matrix = [[0., 1.], [2., 3.]]
-    for ord_ in "foo", -7, -1.1, 0:
+    for ord_ in "fro", -7, -1.1, 0:
       with self.assertRaisesRegexp(ValueError,
                                    "'ord' must be a supported vector norm"):
-        linalg_ops.norm(matrix, ord="fro")
+        linalg_ops.norm(matrix, ord=ord_)
 
-    for ord_ in "foo", -7, -1.1, 0:
+    for ord_ in "fro", -7, -1.1, 0:
       with self.assertRaisesRegexp(ValueError,
                                    "'ord' must be a supported vector norm"):
         linalg_ops.norm(matrix, ord=ord_, axis=-1)
 
-    for ord_ in 1.1, 2:
+    for ord_ in "foo", -7, -1.1, 1.1:
       with self.assertRaisesRegexp(ValueError,
                                    "'ord' must be a supported matrix norm"):
         linalg_ops.norm(matrix, ord=ord_, axis=[-2, -1])
@@ -69,14 +69,14 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
       if use_static_shape_:
         tf_matrix = constant_op.constant(matrix)
         tf_norm = linalg_ops.norm(
-            tf_matrix, ord=ord_, axis=axis_, keep_dims=keep_dims_)
+            tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
         tf_norm_val = sess.run(tf_norm)
       else:
         tf_matrix = array_ops.placeholder(dtype_)
         tf_norm = linalg_ops.norm(
-            tf_matrix, ord=ord_, axis=axis_, keep_dims=keep_dims_)
+            tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
         tf_norm_val = sess.run(tf_norm, feed_dict={tf_matrix: matrix})
-    self.assertAllClose(np_norm, tf_norm_val)
+    self.assertAllClose(np_norm, tf_norm_val, rtol=1e-5, atol=1e-5)
 
   def Test(self):
     is_matrix_norm = (isinstance(axis_, tuple) or
@@ -85,8 +85,6 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
     if ((not is_matrix_norm and ord_ == "fro") or
         (is_matrix_norm and is_fancy_p_norm)):
       self.skipTest("Not supported by neither numpy.linalg.norm nor tf.norm")
-    if is_matrix_norm and ord_ == 2:
-      self.skipTest("Not supported by tf.norm")
     if ord_ == 'euclidean' or (axis_ is None and len(shape) > 2):
       self.skipTest("Not supported by numpy.linalg.norm")
     matrix = np.random.randn(*shape_).astype(dtype_)
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 5b508b7c0e..b9f44d728a 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -52,6 +52,38 @@ class PyFuncTest(test.TestCase):
   """Encapsulates tests for py_func and eager_py_func."""
 
   # ----- Tests for py_func -----
+  def testRealDataTypes(self):
+    def sum_func(x, y):
+      return x + y
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.uint8, dtypes.int8, dtypes.uint16, dtypes.int16,
+                  dtypes.int32, dtypes.int64]:
+      with self.test_session():
+        x = constant_op.constant(1, dtype=dtype)
+        y = constant_op.constant(2, dtype=dtype)
+        z = self.evaluate(script_ops.py_func(sum_func, [x, y], dtype))
+        self.assertEqual(z, 3)
+
+  def testComplexDataTypes(self):
+    def sub_func(x, y):
+      return x - y
+    for dtype in [dtypes.complex64, dtypes.complex128]:
+      with self.test_session():
+        x = constant_op.constant(1 + 1j, dtype=dtype)
+        y = constant_op.constant(2 - 2j, dtype=dtype)
+        z = self.evaluate(script_ops.py_func(sub_func, [x, y], dtype))
+        self.assertEqual(z, -1 + 3j)
+
+  def testBoolDataTypes(self):
+    def and_func(x, y):
+      return x and y
+    dtype = dtypes.bool
+    with self.test_session():
+      x = constant_op.constant(True, dtype=dtype)
+      y = constant_op.constant(False, dtype=dtype)
+      z = self.evaluate(script_ops.py_func(and_func, [x, y], dtype))
+      self.assertEqual(z, False)
+
   def testSingleType(self):
     with self.test_session():
       x = constant_op.constant(1.0, dtypes.float32)
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
index a9dc7b7de0..051c7d86bf 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
@@ -46,7 +46,7 @@ def composed_sampler(logits, num_samples):
   logits = array_ops.expand_dims(logits, -1)
 
   # [batch size, num samples]
-  return math_ops.argmax(logits + noise, dimension=1)
+  return math_ops.argmax(logits + noise, axis=1)
 
 
 native_sampler = random_ops.multinomial
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index df37dd98ec..e4b5c3832a 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -228,6 +228,17 @@ class RandomUniformTest(test.TestCase):
         print("count = ", count)
       self.assertTrue(count < count_limit)
 
+  def testUniformIntsWithInvalidShape(self):
+    for dtype in dtypes.int32, dtypes.int64:
+      with self.assertRaisesRegexp(
+          ValueError, "Shape must be rank 0 but is rank 1"):
+        random_ops.random_uniform(
+            [1000], minval=[1, 2], maxval=3, dtype=dtype)
+      with self.assertRaisesRegexp(
+          ValueError, "Shape must be rank 0 but is rank 1"):
+        random_ops.random_uniform(
+            [1000], minval=1, maxval=[2, 3], dtype=dtype)
+
   # Check that uniform ints actually follow a uniform distribution.
   def testUniformInts(self):
     minv = -2
diff --git a/tensorflow/python/kernel_tests/string_strip_op_test.py b/tensorflow/python/kernel_tests/string_strip_op_test.py
new file mode 100644
index 0000000000..30fd477ff4
--- /dev/null
+++ b/tensorflow/python/kernel_tests/string_strip_op_test.py
@@ -0,0 +1,56 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for string_strip_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class StringStripOpTest(test.TestCase):
+  """Test cases for tf.string_strip."""
+
+  def test_string_strip(self):
+    strings = ["pigs on the wing", "animals"]
+
+    with self.test_session() as sess:
+      output = string_ops.string_strip(strings)
+      output = sess.run(output)
+      self.assertAllEqual(output, [b"pigs on the wing", b"animals"])
+
+  def test_string_strip_2d(self):
+    strings = [["pigs on the wing", "animals"],
+               [" hello ", "\n\tworld \r \n"]]
+
+    with self.test_session() as sess:
+      output = string_ops.string_strip(strings)
+      output = sess.run(output)
+      self.assertAllEqual(output, [[b"pigs on the wing", b"animals"],
+                                   [b"hello", b"world"]])
+
+  def test_string_strip_with_empty_strings(self):
+    strings = [" hello ", "", "world ", " \t \r \n "]
+
+    with self.test_session() as sess:
+      output = string_ops.string_strip(strings)
+      output = sess.run(output)
+      self.assertAllEqual(output, [b"hello", b"", b"world", b""])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 22317a348c..8c6bb7955a 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -126,6 +126,9 @@ Status NumericNpDTypeToTfDType(const int np, DataType* tf) {
     case NPY_INT8:
       *tf = DT_INT8;
       break;
+    case NPY_UINT16:
+      *tf = DT_UINT16;
+      break;
     case NPY_INT16:
       *tf = DT_INT16;
       break;
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index fa26e07c85..ceeabe090d 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -144,6 +144,7 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
 
 # pylint: disable=redefined-builtin,protected-access
 @tf_export("expand_dims")
+@deprecation.deprecated_args(None, "Use the `axis` argument instead", "dim")
 def expand_dims(input, axis=None, name=None, dim=None):
   """Inserts a dimension of 1 into a tensor's shape.
 
@@ -193,11 +194,7 @@ def expand_dims(input, axis=None, name=None, dim=None):
   Raises:
     ValueError: if both `dim` and `axis` are specified.
   """
-  # TODO(aselle): Remove argument dim
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("can't specify both 'dim' and 'axis'")
-    axis = dim
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
   return gen_array_ops.expand_dims(input, axis, name)
 
 
@@ -2581,6 +2578,8 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
 
 
 @tf_export("squeeze")
+@deprecation.deprecated_args(None, "Use the `axis` argument instead",
+                             "squeeze_dims")
 def squeeze(input, axis=None, name=None, squeeze_dims=None):
   # pylint: disable=redefined-builtin
   """Removes dimensions of size 1 from the shape of a tensor.
@@ -2621,10 +2620,8 @@ def squeeze(input, axis=None, name=None, squeeze_dims=None):
   Raises:
     ValueError: When both `squeeze_dims` and `axis` are specified.
   """
-  if squeeze_dims is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'squeeze_dims' and 'axis'")
-    axis = squeeze_dims
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "squeeze_dims", squeeze_dims)
   if np.isscalar(axis):
     axis = [axis]
   return gen_array_ops.squeeze(input, axis, name)
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index 66fa9e110c..8f25b1149c 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -311,7 +311,7 @@ class Categorical(distribution.Distribution):
         nn_ops.log_softmax(self.logits) * self.probs, axis=-1)
 
   def _mode(self):
-    ret = math_ops.argmax(self.logits, dimension=self._batch_rank)
+    ret = math_ops.argmax(self.logits, axis=self._batch_rank)
     ret = math_ops.cast(ret, self.dtype)
     ret.set_shape(self.batch_shape)
     return ret
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index f0120f2957..9e46739bc1 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -331,11 +331,11 @@ def embedding_lookup_sparse(params,
       representing sharded embedding tensors.  Alternatively, a
       `PartitionedVariable`, created by partitioning along dimension 0. Each
       element must be appropriately sized for the given `partition_strategy`.
-    sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId),
+    sp_ids: N x M `SparseTensor` of int64 ids (typically from FeatureValueToId),
       where N is typically batch size and M is arbitrary.
-    sp_weights: either a SparseTensor of float / double weights, or None to
-      indicate all weights should be taken to be 1. If specified, sp_weights
-      must have exactly the same shape and indices as sp_ids.
+    sp_weights: either a `SparseTensor` of float / double weights, or `None` to
+      indicate all weights should be taken to be 1. If specified, `sp_weights`
+      must have exactly the same shape and indices as `sp_ids`.
     partition_strategy: A string specifying the partitioning strategy, relevant
       if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
       is `"mod"`. See `tf.nn.embedding_lookup` for more details.
@@ -351,39 +351,43 @@ def embedding_lookup_sparse(params,
 
   Returns:
     A dense tensor representing the combined embeddings for the
-    sparse ids. For each row in the dense tensor represented by sp_ids, the op
+    sparse ids. For each row in the dense tensor represented by `sp_ids`, the op
     looks up the embeddings for all ids in that row, multiplies them by the
     corresponding weight, and combines these embeddings as specified.
 
     In other words, if
 
-      shape(combined params) = [p0, p1, ..., pm]
+      `shape(combined params) = [p0, p1, ..., pm]`
 
     and
 
-      shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]
+      `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]`
 
     then
 
-      shape(output) = [d0, d1, ..., dn-1, p1, ..., pm].
+      `shape(output) = [d0, d1, ..., dn-1, p1, ..., pm]`.
 
     For instance, if params is a 10x20 matrix, and sp_ids / sp_weights are
 
+      ```python
       [0, 0]: id 1, weight 2.0
       [0, 1]: id 3, weight 0.5
       [1, 0]: id 0, weight 1.0
       [2, 3]: id 1, weight 3.0
+      ```
 
     with `combiner`="mean", then the output will be a 3x20 matrix where
 
+      ```python
       output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
       output[1, :] = (params[0, :] * 1.0) / 1.0
       output[2, :] = (params[1, :] * 3.0) / 3.0
+      ```
 
   Raises:
-    TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither
-      None nor SparseTensor.
-    ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}.
+    TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is
+      neither `None` nor `SparseTensor`.
+    ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}.
   """
   if combiner is None:
     logging.warn("The default value of combiner will change from \"mean\" "
diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py
index 4a1ef54fb5..ec38d89a0e 100644
--- a/tensorflow/python/ops/histogram_ops.py
+++ b/tensorflow/python/ops/histogram_ops.py
@@ -32,7 +32,6 @@ from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
-from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('histogram_fixed_width_bins')
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 3369fe3c9b..601010bce9 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -269,17 +269,7 @@ def random_flip_up_down(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'random_flip_up_down', [image]) as scope:
-    image = ops.convert_to_tensor(image, name='image')
-    image = _Assert3DImage(image)
-    uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
-    mirror_cond = math_ops.less(uniform_random, .5)
-    result = control_flow_ops.cond(
-        mirror_cond,
-        lambda: array_ops.reverse(image, [0]),
-        lambda: image,
-        name=scope)
-    return fix_image_flip_shape(image, result)
+  return _random_flip(image, 0, seed, 'random_flip_up_down')
 
 
 @tf_export('image.random_flip_left_right')
@@ -301,14 +291,34 @@ def random_flip_left_right(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'random_flip_left_right', [image]) as scope:
+  return _random_flip(image, 1, seed, 'random_flip_left_right')
+
+
+def _random_flip(image, flip_index, seed, scope_name):
+  """Randomly (50% chance) flip an image along axis `flip_index`.
+
+  Args:
+    image: A 3-D tensor of shape `[height, width, channels]`.
+    flip_index: The dimension along which to flip the image.
+      Vertical: 0, Horizontal: 1
+    seed: A Python integer. Used to create a random seed. See
+      @{tf.set_random_seed} for behavior.
+    scope_name: Name of the scope in which the ops are added.
+
+  Returns:
+    A 3-D tensor of the same type and shape as `image`.
+
+  Raises:
+    ValueError: if the shape of `image` is not supported.
+  """
+  with ops.name_scope(None, scope_name, [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
     image = _Assert3DImage(image)
     uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
     mirror_cond = math_ops.less(uniform_random, .5)
     result = control_flow_ops.cond(
         mirror_cond,
-        lambda: array_ops.reverse(image, [1]),
+        lambda: array_ops.reverse(image, [flip_index]),
         lambda: image,
         name=scope)
     return fix_image_flip_shape(image, result)
@@ -332,16 +342,7 @@ def flip_left_right(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'flip_left_right', [image]):
-    image = ops.convert_to_tensor(image, name='image')
-    image = _AssertAtLeast3DImage(image)
-    shape = image.get_shape()
-    if shape.ndims == 3 or shape.ndims is None:
-      return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
-    elif shape.ndims == 4:
-      return array_ops.reverse(image, [2])
-    else:
-      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+  return _flip(image, 1, 'flip_left_right')
 
 
 @tf_export('image.flip_up_down')
@@ -362,14 +363,35 @@ def flip_up_down(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'flip_up_down', [image]):
+  return _flip(image, 0, 'flip_up_down')
+
+
+def _flip(image, flip_index, scope_name):
+  """Flip an image either horizontally or vertically.
+
+  Outputs the contents of `image` flipped along the dimension `flip_index`.
+  See also `reverse()`.
+
+  Args:
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
+    flip_index: 0 for vertical, 1 for horizontal.
+    scope_name: Name of the scope in which the ops are added.
+
+  Returns:
+    A tensor of the same type and shape as `image`.
+
+  Raises:
+    ValueError: if the shape of `image` is not supported.
+  """
+  with ops.name_scope(None, scope_name, [image]):
     image = ops.convert_to_tensor(image, name='image')
     image = _AssertAtLeast3DImage(image)
     shape = image.get_shape()
     if shape.ndims == 3 or shape.ndims is None:
-      return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
+      return fix_image_flip_shape(image, array_ops.reverse(image, [flip_index]))
     elif shape.ndims == 4:
-      return array_ops.reverse(image, [1])
+      return array_ops.reverse(image, [flip_index+1])
     else:
       raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 39b7295124..f93bf0a17f 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -39,10 +39,10 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import linalg_ops_impl
+from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
@@ -529,7 +529,7 @@ class Orthogonal(Initializer):
     # Generate a random matrix
     a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
     # Compute the qr factorization
-    q, r = linalg_ops.qr(a, full_matrices=False)
+    q, r = gen_linalg_ops.qr(a, full_matrices=False)
     # Make Q uniform
     d = array_ops.diag_part(r)
     q *= math_ops.sign(d)
@@ -577,7 +577,7 @@ class ConvolutionDeltaOrthogonal(Initializer):
     a = random_ops.random_normal([shape[-1], shape[-1]],
                                  dtype=dtype, seed=self.seed)
     # Compute the qr factorization
-    q, r = linalg_ops.qr(a, full_matrices=False)
+    q, r = gen_linalg_ops.qr(a, full_matrices=False)
     # Make Q uniform
     d = array_ops.diag_part(r)
     q *= math_ops.sign(d)
@@ -636,7 +636,7 @@ class ConvolutionOrthogonal(Initializer):
     a = random_ops.random_normal([n, n], dtype=self.dtype, seed=self.seed)
     if self.seed:
       self.seed += 1
-    q, r = linalg_ops.qr(a)
+    q, r = gen_linalg_ops.qr(a)
     d = array_ops.diag_part(r)
     # make q uniform
     q *= math_ops.sign(d)
@@ -723,7 +723,7 @@ class ConvolutionOrthogonal2D(ConvolutionOrthogonal):
       raise ValueError("The dimension of the matrices must be the same.")
     n = p1.shape.as_list()[0]
     kernel2x2 = {}
-    eye = linalg_ops.eye(n, dtype=self.dtype)
+    eye = linalg_ops_impl.eye(n, dtype=self.dtype)
     kernel2x2[0, 0] = math_ops.matmul(p1, p2)
     kernel2x2[0, 1] = math_ops.matmul(p1, (eye - p2))
     kernel2x2[1, 0] = math_ops.matmul((eye - p1), p2)
@@ -848,7 +848,7 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal):
     """
     n = projection_matrix.shape.as_list()[0]
     kernel = {}
-    eye = linalg_ops.eye(n, dtype=self.dtype)
+    eye = linalg_ops_impl.eye(n, dtype=self.dtype)
     kernel[0] = projection_matrix
     kernel[1] = eye - projection_matrix
     return kernel
@@ -976,7 +976,7 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
     if p1_shape != p2.shape.as_list() or p1_shape != p3.shape.as_list():
       raise ValueError("The dimension of the matrices must be the same.")
     n = p1_shape[0]
-    eye = linalg_ops.eye(n, dtype=self.dtype)
+    eye = linalg_ops_impl.eye(n, dtype=self.dtype)
     kernel2x2x2 = {}
     def matmul(p1, p2, p3):
       return math_ops.matmul(math_ops.matmul(p1, p2), p3)
@@ -1084,7 +1084,7 @@ class Identity(Initializer):
           "Identity matrix initializer can only be used for 2D matrices.")
     if dtype is None:
       dtype = self.dtype
-    initializer = linalg_ops.eye(*full_shape, dtype=dtype)
+    initializer = linalg_ops_impl.eye(*full_shape, dtype=dtype)
     if partition_info is not None:
       initializer = array_ops.slice(initializer, partition_info.var_offset,
                                     shape)
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 170861b43f..a0dfa543f9 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -24,12 +24,13 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_linalg_ops
+from tensorflow.python.ops import linalg_ops_impl
 from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_linalg_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
@@ -159,36 +160,11 @@ def eye(num_rows,
   Returns:
     A `Tensor` of shape `batch_shape + [num_rows, num_columns]`
   """
-  with ops.name_scope(
-      name, default_name='eye', values=[num_rows, num_columns, batch_shape]):
-    is_square = num_columns is None
-    batch_shape = [] if batch_shape is None else batch_shape
-    num_columns = num_rows if num_columns is None else num_columns
-    if isinstance(num_rows, ops.Tensor) or isinstance(
-        num_columns, ops.Tensor) or isinstance(batch_shape, ops.Tensor):
-      batch_shape = ops.convert_to_tensor(
-          batch_shape, name='shape', dtype=dtypes.int32)
-      diag_size = math_ops.minimum(num_rows, num_columns)
-      diag_shape = array_ops.concat((batch_shape, [diag_size]), 0)
-      if not is_square:
-        shape = array_ops.concat((batch_shape, [num_rows, num_columns]), 0)
-    else:
-      if not isinstance(num_rows, compat.integral_types) or not isinstance(
-          num_columns, compat.integral_types):
-        raise TypeError(
-            'num_rows and num_columns must be positive integer values.')
-      batch_shape = [dim for dim in batch_shape]
-      is_square = num_rows == num_columns
-      diag_shape = batch_shape + [np.minimum(num_rows, num_columns)]
-      if not is_square:
-        shape = batch_shape + [num_rows, num_columns]
-
-    diag_ones = array_ops.ones(diag_shape, dtype=dtype)
-    if is_square:
-      return array_ops.matrix_diag(diag_ones)
-    else:
-      zero_matrix = array_ops.zeros(shape, dtype=dtype)
-      return array_ops.matrix_set_diag(zero_matrix, diag_ones)
+  return linalg_ops_impl.eye(num_rows,
+                             num_columns=num_columns,
+                             batch_shape=batch_shape,
+                             dtype=dtype,
+                             name=name)
 
 
 @tf_export('matrix_solve_ls', 'linalg.lstsq')
@@ -454,7 +430,7 @@ def norm(tensor,
 
   This function can compute several different vector norms (the 1-norm, the
   Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
-  matrix norms (Frobenius, 1-norm, and inf-norm).
+  matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).
 
   Args:
     tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
@@ -465,7 +441,7 @@ def norm(tensor,
       Some restrictions apply:
         a) The Frobenius norm `fro` is not defined for vectors,
         b) If axis is a 2-tuple (matrix norm), only 'euclidean', 'fro', `1`,
-           `np.inf` are supported.
+           `2`, `np.inf` are supported.
       See the description of `axis` on how to compute norms for a batch of
       vectors or matrices stored in a tensor.
     axis: If `axis` is `None` (the default), the input is considered a vector
@@ -521,8 +497,7 @@ def norm(tensor,
         axis[0] == axis[1]):
       raise ValueError(
           "'axis' must be None, an integer, or a tuple of 2 unique integers")
-    # TODO(rmlarsen): Implement matrix 2-norm using tf.svd().
-    supported_matrix_norms = ['euclidean', 'fro', 1, np.inf]
+    supported_matrix_norms = ['euclidean', 'fro', 1, 2, np.inf]
     if ord not in supported_matrix_norms:
       raise ValueError("'ord' must be a supported matrix norm in %s, got %s" %
                        (supported_matrix_norms, ord))
@@ -539,12 +514,34 @@ def norm(tensor,
 
   with ops.name_scope(name, 'norm', [tensor]):
     tensor = ops.convert_to_tensor(tensor)
+
     if ord in ['fro', 'euclidean', 2, 2.0]:
-      # TODO(rmlarsen): Move 2-norm to a separate clause once we support it for
-      # matrices.
-      result = math_ops.sqrt(
-          math_ops.reduce_sum(
-              tensor * math_ops.conj(tensor), axis, keepdims=True))
+      if is_matrix_norm and ord in [2, 2.0]:
+        rank = array_ops.rank(tensor)
+        positive_axis = functional_ops.map_fn(
+            lambda i: control_flow_ops.cond(i >= 0, lambda: i, lambda: i + rank),
+            ops.convert_to_tensor(axis))
+        axes = math_ops.range(rank)
+        perm_before = array_ops.concat(
+            [array_ops.setdiff1d(axes, positive_axis)[0], positive_axis],
+            axis=0)
+        perm_after = functional_ops.map_fn(
+            lambda i: math_ops.cast(
+                array_ops.squeeze(
+                    array_ops.where(math_ops.equal(perm_before, i))),
+                dtype=dtypes.int32), axes)
+        permed = array_ops.transpose(tensor, perm=perm_before)
+        matrix_2_norm = array_ops.expand_dims(
+            math_ops.reduce_max(
+                math_ops.abs(gen_linalg_ops.svd(permed, compute_uv=False)[0]),
+                axis=-1,
+                keepdims=True),
+            axis=-1)
+        result = array_ops.transpose(matrix_2_norm, perm=perm_after)
+      else:
+        result = math_ops.sqrt(
+            math_ops.reduce_sum(
+                tensor * math_ops.conj(tensor), axis, keepdims=True))
     else:
       result = math_ops.abs(tensor)
       if ord == 1:
diff --git a/tensorflow/python/ops/linalg_ops_impl.py b/tensorflow/python/ops/linalg_ops_impl.py
new file mode 100644
index 0000000000..e7c89f6ae3
--- /dev/null
+++ b/tensorflow/python/ops/linalg_ops_impl.py
@@ -0,0 +1,73 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations for linear algebra."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util import compat
+
+# Names below are lower_case.
+# pylint: disable=invalid-name
+
+
+def eye(num_rows,
+        num_columns=None,
+        batch_shape=None,
+        dtype=dtypes.float32,
+        name=None):
+  """Construct an identity matrix, or a batch of matrices.
+
+  See `linalg_ops.eye`.
+  """
+  with ops.name_scope(
+      name, default_name='eye', values=[num_rows, num_columns, batch_shape]):
+    is_square = num_columns is None
+    batch_shape = [] if batch_shape is None else batch_shape
+    num_columns = num_rows if num_columns is None else num_columns
+    if isinstance(num_rows, ops.Tensor) or isinstance(
+        num_columns, ops.Tensor) or isinstance(batch_shape, ops.Tensor):
+      batch_shape = ops.convert_to_tensor(
+          batch_shape, name='shape', dtype=dtypes.int32)
+      diag_size = math_ops.minimum(num_rows, num_columns)
+      diag_shape = array_ops.concat((batch_shape, [diag_size]), 0)
+      if not is_square:
+        shape = array_ops.concat((batch_shape, [num_rows, num_columns]), 0)
+    else:
+      if not isinstance(num_rows, compat.integral_types) or not isinstance(
+          num_columns, compat.integral_types):
+        raise TypeError(
+            'num_rows and num_columns must be positive integer values.')
+      batch_shape = [dim for dim in batch_shape]
+      is_square = num_rows == num_columns
+      diag_shape = batch_shape + [np.minimum(num_rows, num_columns)]
+      if not is_square:
+        shape = batch_shape + [num_rows, num_columns]
+
+    diag_ones = array_ops.ones(diag_shape, dtype=dtype)
+    if is_square:
+      return array_ops.matrix_diag(diag_ones)
+    else:
+      zero_matrix = array_ops.zeros(shape, dtype=dtype)
+      return array_ops.matrix_set_diag(zero_matrix, diag_ones)
+
+# pylint: enable=invalid-name
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 34ca1adc3e..9fc545c967 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import util
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -306,11 +307,8 @@ def cosine_distance(
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `axis`, `labels`, `predictions` or `weights` is `None`.
   """
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dim'")
-    axis = dim
-  if axis is None and dim is None:
+  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  if axis is None:
     raise ValueError("You must specify 'axis'.")
   if labels is None:
     raise ValueError("labels must not be None.")
@@ -696,7 +694,7 @@ def softmax_cross_entropy(
     onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
     reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
-  """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits.
+  """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits_v2.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
   then the loss is simply scaled by the given value. If `weights` is a
@@ -707,11 +705,16 @@ def softmax_cross_entropy(
       new_onehot_labels = onehot_labels * (1 - label_smoothing)
                           + label_smoothing / num_classes
 
+  Note that `onehot_labels` and `logits` must have the same shape,
+  e.g. `[batch_size, num_classes]`. The shape of `weights` must be
+  broadcastable to loss, whose shape is decided by the shape of `logits`.
+  In case the shape of `logits` is `[batch_size, num_classes]`, loss is
+  a `Tensor` of shape `[batch_size]`.
+
   Args:
-    onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels.
-    logits: `[batch_size, num_classes]` logits outputs of the network .
-    weights: Optional `Tensor` whose rank is either 0, or rank 1 and is
-      broadcastable to the loss which is a `Tensor` of shape `[batch_size]`.
+    onehot_labels: One-hot-encoded labels.
+    logits: Logits outputs of the network.
+    weights: Optional `Tensor` that is broadcastable to loss.
     label_smoothing: If greater than 0 then smooth the labels.
     scope: the scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 2b04866fef..2feb88cb7b 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -211,11 +211,9 @@ def argmax(input,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  if dimension is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dimension'")
-    axis = dimension
-  elif axis is None:
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "dimension", dimension)
+  if axis is None:
     axis = 0
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
 
@@ -231,11 +229,9 @@ def argmin(input,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  if dimension is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dimension'")
-    axis = dimension
-  elif axis is None:
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "dimension", dimension)
+  if axis is None:
     axis = 0
   return gen_math_ops.arg_min(input, axis, name=name, output_type=output_type)
 
@@ -761,13 +757,25 @@ def cast(x, dtype, name=None):
   tf.cast(x, tf.int32)  # [1, 2], dtype=tf.int32
   ```
 
+  The operation supports data types (for `x` and `dtype`) of
+  `uint8`, `int8`, `uint16`, `int16`, `int32`, `int64`, `float16`, `float32`,
+  `float64`, `complex64`, `complex128`, `bfloat16`. In case of casting from
+  complex types (`complex64`, `complex128`) to real types, only the real part
+  of `x` is returned. In case of casting from real types to complex types
+  (`complex64`, `complex128`), the imaginary part of the returned value is set
+  to `0`. The handling of complex types here matches the behavior of numpy.
+
   Args:
-    x: A `Tensor` or `SparseTensor`.
-    dtype: The destination type.
+    x: A `Tensor` or `SparseTensor` of numeric type. It could be
+      `uint8`, `int8`, `uint16`, `int16`, `int32`, `int64`,
+      `float16`, `float32`, `float64`, `complex64`, `complex128`, `bfloat16`.
+    dtype: The destination type. The list of supported dtypes is the same
+      as `x`.
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` with same shape as `x`.
+    A `Tensor` or `SparseTensor` with same shape as `x` and
+      same type as `dtype`.
 
   Raises:
     TypeError: If `x` cannot be cast to the `dtype`.
@@ -1634,7 +1642,7 @@ def reduce_min(input_tensor,
   tensor with a single element is returned.
 
   Args:
-    input_tensor: The tensor to reduce. Should have numeric type.
+    input_tensor: The tensor to reduce. Should have real numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
@@ -1683,7 +1691,7 @@ def reduce_max(input_tensor,
   tensor with a single element is returned.
 
   Args:
-    input_tensor: The tensor to reduce. Should have numeric type.
+    input_tensor: The tensor to reduce. Should have real numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index 244702d13b..1d0d9a52a1 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -98,6 +98,7 @@ See the @{$python/nn} guide.
 @@fixed_unigram_candidate_sampler
 @@compute_accidental_hits
 @@quantized_conv2d
+@@quantized_relu
 @@quantized_relu_x
 @@quantized_max_pool
 @@quantized_avg_pool
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 47cc4da7f2..d0d5ed07ce 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -987,7 +987,7 @@ def _compute_sampled_logits(weights,
         class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
-        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
+        the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
@@ -1012,7 +1012,7 @@ def _compute_sampled_logits(weights,
     out_logits: `Tensor` object with shape
         `[batch_size, num_true + num_sampled]`, for passing to either
         `nn.sigmoid_cross_entropy_with_logits` (NCE) or
-        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
+        `nn.softmax_cross_entropy_with_logits_v2` (sampled softmax).
     out_labels: A Tensor object with the same shape as `out_logits`.
   """
 
@@ -1285,7 +1285,7 @@ def sampled_softmax_loss(weights,
     logits = tf.matmul(inputs, tf.transpose(weights))
     logits = tf.nn.bias_add(logits, biases)
     labels_one_hot = tf.one_hot(labels, n_classes)
-    loss = tf.nn.softmax_cross_entropy_with_logits(
+    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
         labels=labels_one_hot,
         logits=logits)
   ```
@@ -1303,7 +1303,7 @@ def sampled_softmax_loss(weights,
     biases: A `Tensor` of shape `[num_classes]`.  The class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
-        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
+        the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
@@ -1340,7 +1340,8 @@ def sampled_softmax_loss(weights,
       partition_strategy=partition_strategy,
       name=name,
       seed=seed)
-  sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
+  labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
+  sampled_losses = nn_ops.softmax_cross_entropy_with_logits_v2(
       labels=labels, logits=logits)
   # sampled_losses is a [batch_size] tensor.
   return sampled_losses
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index bb454b3c3a..cd07550d2e 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1155,7 +1155,7 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
 
   Returns:
     A `Tensor` with the same type as `value`.
-    Output shape with `'VALID`` padding is:
+    Output shape with `'VALID'` padding is:
 
         [batch, height - 2 * (filter_width - 1),
          width - 2 * (filter_height - 1), out_channels].
@@ -1458,10 +1458,10 @@ def conv3d_transpose(
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [5] if reached this point.
-      if not filter.get_shape()[3].is_compatible_with(output_shape[4]):
+      if not filter.get_shape()[3].is_compatible_with(output_shape[axis]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[4],
+            "{} != {}".format(output_shape[axis],
                               filter.get_shape()[3]))
 
     if padding != "VALID" and padding != "SAME":
@@ -1986,7 +1986,7 @@ def sparse_softmax_cross_entropy_with_logits(
   must provide a single specific index for the true class for each row of
   `logits` (each minibatch entry).  For soft softmax classification with
   a probability distribution for each entry, see
-  `softmax_cross_entropy_with_logits`.
+  `softmax_cross_entropy_with_logits_v2`.
 
   **WARNING:** This op expects unscaled logits, since it performs a `softmax`
   on `logits` internally for efficiency.  Do not call this op with the
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 9251e9802c..86dc053c0f 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -617,9 +617,9 @@ class BasicLSTMCell(LayerRNNCell):
     Args:
       inputs: `2-D` tensor with shape `[batch_size, input_size]`.
       state: An `LSTMStateTuple` of state tensors, each shaped
-        `[batch_size, self.state_size]`, if `state_is_tuple` has been set to
+        `[batch_size, num_units]`, if `state_is_tuple` has been set to
         `True`.  Otherwise, a `Tensor` shaped
-        `[batch_size, 2 * self.state_size]`.
+        `[batch_size, 2 * num_units]`.
 
     Returns:
       A pair containing the new hidden state, and the new state (either a
diff --git a/tensorflow/python/profiler/tfprof_logger_test.py b/tensorflow/python/profiler/tfprof_logger_test.py
index 141144f987..caf3869f56 100644
--- a/tensorflow/python/profiler/tfprof_logger_test.py
+++ b/tensorflow/python/profiler/tfprof_logger_test.py
@@ -38,7 +38,7 @@ class TFProfLoggerTest(test.TestCase):
     return math_ops.matmul(a, b)
 
   # pylint: disable=pointless-string-statement
-  """# TODO(xpan): This this out of core so it doesn't depend on contrib.
+  """# TODO(xpan): Move this out of core so it doesn't depend on contrib.
   def testFillMissingShape(self):
     a, b, y = self._BuildSmallPlaceholderlModel()
     run_options = config_pb2.RunOptions(
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index b88be4ae04..73ea85ab0c 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -41,6 +41,7 @@ from tensorflow.python.debug.wrappers import local_cli_wrapper
 from tensorflow.python.framework import meta_graph as meta_graph_lib
 from tensorflow.python.framework import ops as ops_lib
 from tensorflow.python.platform import app  # pylint: disable=unused-import
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.saved_model import loader
 from tensorflow.python.tools import saved_model_utils
 
@@ -543,7 +544,7 @@ def load_inputs_from_input_arg_string(inputs_str, input_exprs_str,
   input_examples = preprocess_input_examples_arg_string(input_examples_str)
 
   for input_tensor_key, (filename, variable_name) in inputs.items():
-    data = np.load(filename)
+    data = np.load(file_io.FileIO(filename, mode='rb'))  # np.load needs bytes
 
     # When a variable_name key is specified for the input file
     if variable_name:
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 3867c0d8da..70495291bc 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -2731,7 +2731,7 @@ class ScopedGraphTest(test.TestCase):
       # The rest of the variables.
       rest_variables = list(
           set(variables.global_variables()) - set(var_list.keys()))
-      init_rest_op = variables.initialize_variables(rest_variables)
+      init_rest_op = variables.variables_initializer(rest_variables)
 
     with self.test_session(graph=graph) as sess:
       saver = saver_module.Saver(var_list=var_list, max_to_keep=1)
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 4163fcac79..3358ffe526 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -42,10 +42,8 @@ import six as _six
 
 from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
-from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('compat.as_bytes', 'compat.as_str')
 def as_bytes(bytes_or_text, encoding='utf-8'):
   """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text.
 
@@ -68,7 +66,6 @@ def as_bytes(bytes_or_text, encoding='utf-8'):
                     (bytes_or_text,))
 
 
-@tf_export('compat.as_text')
 def as_text(bytes_or_text, encoding='utf-8'):
   """Returns the given argument as a unicode string.
 
@@ -93,8 +90,12 @@ def as_text(bytes_or_text, encoding='utf-8'):
 # Convert an object to a `str` in both Python 2 and 3.
 if _six.PY2:
   as_str = as_bytes
+  tf_export('compat.as_bytes', 'compat.as_str')(as_bytes)
+  tf_export('compat.as_text')(as_text)
 else:
   as_str = as_text
+  tf_export('compat.as_bytes')(as_bytes)
+  tf_export('compat.as_text', 'compat.as_str')(as_text)
 
 
 @tf_export('compat.as_str_any')
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 640f270323..102419a264 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -524,11 +524,12 @@ port::Status CudnnSupport::Init() {
                                    ToString(status))};
 }
 
-port::StatusOr<std::tuple<int, int, int>> CudnnSupport::GetVersion() {
+port::StatusOr<perftools::gputools::dnn::VersionInfo>
+CudnnSupport::GetVersion() {
   CudnnVersion version;
   TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&version));
-  return std::make_tuple(version.major_version, version.minor_version,
-                         version.patch_level);
+  return perftools::gputools::dnn::VersionInfo(
+      version.major_version, version.minor_version, version.patch_level);
 }
 
 // Turns a BatchDescriptor structure into a cudnn tensor handle within a scope.
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index e6d12bfef9..5ded7cf154 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -45,7 +45,7 @@ class CudnnSupport : public dnn::DnnSupport {
   ~CudnnSupport() override;
 
   port::Status Init() override;
-  port::StatusOr<std::tuple<int, int, int>> GetVersion() override;
+  port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
       int num_layers, int hidden_size, int input_size,
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index fedf4f53b8..71cab145b9 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -37,14 +37,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/lib/inlined_vector.h"
 
-#if defined(PLATFORM_WINDOWS)
-// TODO: in windows ARRAYSIZE is defined in winnt.h but including it
-//  here creates a conflict with cuda.h - for now define it here.
-#define ARRAYSIZE(a) \
-  ((sizeof(a) / sizeof(*(a))) / \
-  static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
-#endif
-
 bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
 bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
 bool FLAGS_gpuexec_cuda_device_0_only = false;
@@ -719,15 +711,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
         port::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
         port::bit_cast<void *>(info_log_buffer.data()),
         port::bit_cast<void *>(uintptr_t(log_verbose))};
-    CHECK(ARRAYSIZE(options) == ARRAYSIZE(option_values));
+    CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
 
     CUresult res;
     {
       // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
       // module loading: see http://b/13248943
 
-      res = cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), options,
-                               option_values);
+      res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options),
+                               options, option_values);
     }
 
     // The PTX JIT mutates the values in the option values array to reflect the
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 9700daca89..7c87d33d21 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -1126,7 +1126,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
     builder.set_name(device_name);
   }
 
-  for (size_t i = 0; i < ARRAYSIZE(kAllUnqueryableDeviceParams); i++) {
+  for (size_t i = 0; i < TF_ARRAYSIZE(kAllUnqueryableDeviceParams); i++) {
     const auto &params = kAllUnqueryableDeviceParams[i];
     if (params.cc_major == cc_major_ && params.cc_minor == cc_minor_) {
       builder.set_blocks_per_core_limit(params.blocks_per_core_limit);
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 8e202d115a..39f21d8b10 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -875,6 +875,22 @@ enum class ElementwiseOperation { kAdd, kMultiply };
 
 string ElementwiseOperationString(ElementwiseOperation op);
 
+// A simple class representing the version of the backing library, used to
+// work around the "too perfect forwarding" issue in gcc6+ compilers.
+// See PR#16309 and issue #18402 for links discussing the issue.
+class VersionInfo {
+ public:
+  VersionInfo(int major = 0, int minor = 0, int patch = 0)
+      : major_(major), minor_(minor), patch_(patch) {}
+  int major_version() const { return major_; }
+  int minor_version() const { return minor_; }
+  int patch() const { return patch_; }
+ private:
+  int major_;
+  int minor_;
+  int patch_;
+};
+
 // Suite of operations typically used for implementing Deep/Convolutional Neural
 // Nets. Note: A false return value of an operation indicates the
 // implementation is not available.
@@ -885,8 +901,8 @@ class DnnSupport {
 
   virtual port::Status Init() = 0;
 
-  // Gets the version of the backing library, as a {major, minor, patch} tuple.
-  virtual port::StatusOr<std::tuple<int, int, int>> GetVersion() {
+  // Gets the version of the backing library, as a VersionInfo object.
+  virtual port::StatusOr<VersionInfo> GetVersion() {
     return port::UnimplementedError(
         "DnnSupport::GetVersion not implemented on this platform.");
   }
diff --git a/tensorflow/stream_executor/platform/port.h b/tensorflow/stream_executor/platform/port.h
index 259cf380d6..57ad965ef1 100644
--- a/tensorflow/stream_executor/platform/port.h
+++ b/tensorflow/stream_executor/platform/port.h
@@ -38,12 +38,6 @@ using tensorflow::uint64;
 using std::string;
 #endif
 
-#if !defined(COMPILER_MSVC)
-#define ARRAYSIZE(a)              \
-    ((sizeof(a) / sizeof(*(a))) / \
-    static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
-#endif
-
 using tensorflow::LinkerInitialized;
 using tensorflow::LINKER_INITIALIZED;
 
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 528f811b40..51e856bed0 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -163,7 +163,6 @@ def if_override_eigen_strong_inline(a):
 
 def get_win_copts(is_external=False):
     WINDOWS_COPTS = [
-        "/D__VERSION__=\\\"MSVC\\\"",
         "/DPLATFORM_WINDOWS",
         "/DEIGEN_HAS_C99_MATH",
         "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
@@ -1704,7 +1703,7 @@ def tf_version_info_genrule():
       ],
       outs=["util/version_info.cc"],
       cmd=
-      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
+      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
       local=1,
       tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
 
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
index 05e603efb7..c8da55d802 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
@@ -6,6 +6,10 @@ tf_class {
     name: "cluster_spec"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "device_fn"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "evaluation_master"
     mtype: "<type \'property\'>"
@@ -84,7 +88,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\'], "
   }
   member_method {
     name: "replace"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index c66249999f..0b12bc060e 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -1980,6 +1980,10 @@ tf_module {
     name: "string_split"
     argspec: "args=[\'source\', \'delimiter\', \'skip_empty\'], varargs=None, keywords=None, defaults=[\' \', \'True\'], "
   }
+  member_method {
+    name: "string_strip"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "string_to_hash_bucket"
     argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 82042b93c0..5fa75e1d61 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -123,6 +123,10 @@ done
 
 BAZEL_FLAGS=$(str_strip "${BAZEL_FLAGS}")
 
+if [[ -n "$GIT_TAG_OVERRIDE" ]]; then  # Forward the override only when set.
+  BAZEL_FLAGS+=" --action_env=GIT_TAG_OVERRIDE"
+fi
+
 echo "Using Bazel flags: ${BAZEL_FLAGS}"
 
 PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package"
diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index caa3a40817..c342367bac 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -213,27 +213,34 @@ USER_OP=$(echo "${USER_OP_SO}" | sed -e 's/\.so//')
 echo "Invoking user op ${USER_OP} defined in file ${USER_OP_SO} "\
 "via pip installation"
 
-ORIG_OUTPUT=$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")
-
-# Format OUTPUT for analysis
-if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then
-  if [[ ${IS_MAC} == "1" ]]; then
-    OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g')
+function run_op() {
+  local ORIG_OUTPUT=$1
+  local ADDITIONAL_LOG=$2
+
+  # Format OUTPUT for analysis
+  if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then
+    if [[ ${IS_MAC} == "1" ]]; then
+      local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g')
+    else
+      local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g')
+    fi
   else
-    OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g')
+    local OUTPUT="${ORIG_OUTPUT}"
   fi
-else
-  OUTPUT="${ORIG_OUTPUT}"
-fi
 
-EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})")
+  local EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})")
 
-if [[ "${EQUALS_EXPECTED}" != "True" ]]; then
-  die "FAILED: Output from user op (${OUTPUT}) does not match expected "\
-"output ${EXPECTED_OUTPUT}"
-else
-  echo "Output from user op (${OUTPUT}) matches expected output"
-fi
+  if [[ "${EQUALS_EXPECTED}" != "True" ]]; then
+    local ERROR="FAILED: Output from user op (${OUTPUT}) does not match expected"
+    ERROR+=" output ${EXPECTED_OUTPUT}${ADDITIONAL_LOG}"
+    die "${ERROR}"
+  else
+    echo "Output from user op (${OUTPUT}) matches expected output"
+  fi
+}
+
+run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")"
+run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}))")" " in eager mode"
 
 popd
 
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
index dbf376be6f..2a9f295188 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
@@ -30,7 +30,10 @@ export PYTHON_BIN_PATH=`which python2`
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
+# Setting KMP_BLOCKTIME to 0 lets OpenMP threads sleep right after parallel execution
+# in an MKL primitive. This reduces the effects of an oversubscription of OpenMP threads
+# caused by executing multiple tests concurrently.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
-    --config=mkl --config=opt --test_output=errors -- \
+    --config=mkl --test_env=KMP_BLOCKTIME=0 --config=opt --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
index 97829892b1..3b437d3c58 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -31,6 +31,9 @@ IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Prog
 :: Set ctest binary location.
 IF DEFINED CTEST_EXE (ECHO CTEST_EXE is set to %CTEST_EXE%) ELSE (SET CTEST_EXE="C:\Program Files\cmake\bin\ctest.exe")
 
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -40,9 +43,6 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
-:: Install absl-py.
-%PIP_EXE% install --upgrade absl-py
-
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index b3dbe475d2..390d7442c3 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -72,7 +72,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.7 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 037d13116e..c65e0b72bc 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.7
+ARG TF_BRANCH=r1.8
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index bfb96da58d..293028d229 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -81,7 +81,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.7 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 78d511969e..73dee98bae 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -139,7 +139,7 @@ def configure(src_base_path, gen_path, debug=False):
     print("gen_git_source.py: spec is %r" % spec)
 
 
-def get_git_version(git_base_path):
+def get_git_version(git_base_path, git_tag_override):
   """Get the git version from the repository.
 
   This function runs `git describe ...` in the path given as `git_base_path`.
@@ -152,6 +152,9 @@ def get_git_version(git_base_path):
 
   Args:
     git_base_path: where the .git directory is located
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   Returns:
     A bytestring representing the git version
   """
@@ -161,6 +164,14 @@ def get_git_version(git_base_path):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
+    if git_tag_override and val:
+      split_val = val.split(b"-")  # `val` is a bytestring on Python 2 and 3.
+      if len(split_val) < 3:  # The tag itself may contain "-" (e.g. v1.8.0-rc1).
+        raise Exception(
+            ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
+             "but got '%s'") % val)
+      # Only the last two fields (commit count, hash) come from `git describe`.
+      val = b"-".join([git_tag_override.encode("utf-8")] + split_val[-2:])
     return val if val else unknown_label
   except (subprocess.CalledProcessError, OSError):
     return unknown_label
@@ -178,7 +189,15 @@ def write_version_info(filename, git_version):
   contents = """/*  Generated by gen_git_source.py  */
 #include <string>
 const char* tf_git_version() {return "%s";}
-const char* tf_compiler_version() {return __VERSION__;}
+const char* tf_compiler_version() {
+#ifdef _MSC_VER
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+  return "MSVC " TOSTRING(_MSC_FULL_VER);
+#else
+  return __VERSION__;
+#endif
+}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
@@ -197,7 +216,7 @@ const int tf_monolithic_build() {
   open(filename, "w").write(contents)
 
 
-def generate(arglist):
+def generate(arglist, git_tag_override=None):
   """Generate version_info.cc as given `destination_file`.
 
   Args:
@@ -217,6 +236,10 @@ def generate(arglist):
   `ref_symlink` is unused in this script but passed, because the build
     system uses that file to detect when commits happen.
 
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
+
   Raises:
     RuntimeError: If ./configure needs to be run, RuntimeError will be raised.
   """
@@ -234,11 +257,11 @@ def generate(arglist):
       raise RuntimeError(
           "Run ./configure again, branch was '%s' but is now '%s'" %
           (old_branch, new_branch))
-    git_version = get_git_version(data["path"])
+    git_version = get_git_version(data["path"], git_tag_override)
   write_version_info(dest_file, git_version)
 
 
-def raw_generate(output_file):
+def raw_generate(output_file, source_dir, git_tag_override=None):
   """Simple generator used for cmake/make build systems.
 
   This does not create any symlinks. It requires the build system
@@ -246,9 +269,13 @@ def raw_generate(output_file):
 
   Args:
     output_file: Output filename for the version info cc
+    source_dir: Base path of the source code
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   """
 
-  git_version = get_git_version(".")
+  git_version = get_git_version(source_dir, git_tag_override)
   write_version_info(output_file, git_version)
 
 
@@ -270,6 +297,11 @@ parser.add_argument(
     "--gen_root_path", type=str,
     help="Root path to place generated git files (created by --configure).")
 
+parser.add_argument(
+    "--git_tag_override", type=str,
+    help="Override git tag value in the __git_version__ string. Useful when "
+         "creating release builds before the release tag is created.")
+
 parser.add_argument(
     "--generate",
     type=str,
@@ -281,6 +313,11 @@ parser.add_argument(
     type=str,
     help="Generate version_info.cc (simpler version used for cmake/make)")
 
+parser.add_argument(
+    "--source_dir",
+    type=str,
+    help="Base path of the source code (used for cmake/make)")
+
 args = parser.parse_args()
 
 if args.configure is not None:
@@ -288,9 +325,12 @@ if args.configure is not None:
     raise RuntimeError("Must pass --gen_root_path arg when running --configure")
   configure(args.configure, args.gen_root_path, debug=args.debug)
 elif args.generate is not None:
-  generate(args.generate)
+  generate(args.generate, args.git_tag_override)
 elif args.raw_generate is not None:
-  raw_generate(args.raw_generate)
+  source_path = "."
+  if args.source_dir is not None:
+    source_path = args.source_dir
+  raw_generate(args.raw_generate, source_path, args.git_tag_override)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
                      "must be used")
diff --git a/tensorflow/tools/git/gen_git_source.sh b/tensorflow/tools/git/gen_git_source.sh
index db20bb00e8..cd128af6b3 100755
--- a/tensorflow/tools/git/gen_git_source.sh
+++ b/tensorflow/tools/git/gen_git_source.sh
@@ -28,7 +28,15 @@ fi
 cat <<EOF > ${OUTPUT_FILENAME}
 #include <string>
 const char* tf_git_version() {return "${GIT_VERSION}";}
-const char* tf_compiler_version() {return __VERSION__;}
+const char* tf_compiler_version() {
+#ifdef _MSC_VER
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+  return "MSVC " TOSTRING(_MSC_FULL_VER);
+#else
+  return __VERSION__;
+#endif
+}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
diff --git a/tensorflow/tools/graph_transforms/transform_graph.cc b/tensorflow/tools/graph_transforms/transform_graph.cc
index 28387c2b48..8ce8f5e24b 100644
--- a/tensorflow/tools/graph_transforms/transform_graph.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph.cc
@@ -24,6 +24,9 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/file_utils.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
+#if !defined(PLATFORM_WINDOWS)
+#include <pwd.h>
+#endif
 
 namespace tensorflow {
 namespace graph_transforms {
@@ -130,16 +133,64 @@ Status ParseTransformParameters(const string& transforms_string,
   return Status::OK();
 }
 
+// Expands a leading "~" or "~user" in `path_string` to the corresponding home
+// directory and returns the resulting path. The input is returned unchanged
+// when it does not start with '~', when no home directory can be resolved, or
+// when PLATFORM_WINDOWS is defined.
+std::string ExpandPath(const std::string& path_string) {
+#if defined(PLATFORM_WINDOWS)
+  return path_string;
+#else
+  const bool has_tilde = !path_string.empty() && path_string[0] == '~';
+  if (!has_tilde) return path_string;
+
+  // Position of the first '/', which ends the "~" or "~user" component.
+  const std::string::size_type slash = path_string.find_first_of('/');
+  const bool no_slash = (slash == std::string::npos);
+
+  const char* home_dir = NULL;
+  if (path_string.length() == 1 || slash == 1) {
+    // Bare "~" or "~/...": use $HOME, else the passwd entry for getuid().
+    home_dir = getenv("HOME");
+    if (home_dir == NULL) {
+      const struct passwd* self_entry = getpwuid(getuid());
+      if (self_entry != NULL) home_dir = self_entry->pw_dir;
+    }
+  } else {
+    // "~user" or "~user/...": look the named user up with getpwnam().
+    const std::string::size_type name_len =
+        no_slash ? std::string::npos : slash - 1;
+    const std::string user_name(path_string, 1, name_len);
+    const struct passwd* user_entry = getpwnam(user_name.c_str());
+    if (user_entry != NULL) home_dir = user_entry->pw_dir;
+  }
+
+  // No home directory could be determined: hand the path back untouched.
+  if (home_dir == NULL) return path_string;
+
+  string expanded(home_dir);
+  // Nothing follows the tilde component, so the home directory is the result.
+  if (no_slash) return expanded;
+
+  // Add a '/' separator only if the home directory does not end with one.
+  const bool ends_with_slash =
+      !expanded.empty() && expanded[expanded.length() - 1] == '/';
+  if (!ends_with_slash) expanded += '/';
+  expanded += path_string.substr(slash + 1);
+  return expanded;
+#endif
+}
+
 int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
-  string in_graph = "";
-  string out_graph = "";
+  string in_graph_string = "";
+  string out_graph_string = "";
   string inputs_string = "";
   string outputs_string = "";
   string transforms_string = "";
   bool output_as_text = false;
   std::vector<Flag> flag_list = {
-      Flag("in_graph", &in_graph, "input graph file name"),
-      Flag("out_graph", &out_graph, "output graph file name"),
+      Flag("in_graph", &in_graph_string, "input graph file name"),
+      Flag("out_graph", &out_graph_string, "output graph file name"),
       Flag("inputs", &inputs_string, "inputs"),
       Flag("outputs", &outputs_string, "outputs"),
       Flag("transforms", &transforms_string, "list of transforms"),
@@ -166,11 +217,11 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     LOG(ERROR) << "Unknown argument " << argv[1] << ".\n" << usage;
     return -1;
   }
-  if (in_graph.empty()) {
+  if (in_graph_string.empty()) {
     LOG(ERROR) << "in_graph graph can't be empty.\n" << usage;
     return -1;
   }
-  if (out_graph.empty()) {
+  if (out_graph_string.empty()) {
     LOG(ERROR) << "out_graph graph can't be empty.\n" << usage;
     return -1;
   }
@@ -179,6 +230,9 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     return -1;
   }
 
+  string in_graph = ExpandPath(in_graph_string);
+  string out_graph = ExpandPath(out_graph_string);
+
   std::vector<string> inputs = str_util::Split(inputs_string, ',');
   std::vector<string> outputs = str_util::Split(outputs_string, ',');
   TransformParameters transform_params;
@@ -197,7 +251,7 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
   GraphDef graph_def;
   Status load_status = LoadTextOrBinaryGraphFile(in_graph, &graph_def);
   if (!load_status.ok()) {
-    LOG(ERROR) << "Loading graph '" << in_graph << "' failed with "
+    LOG(ERROR) << "Loading graph '" << in_graph_string << "' failed with "
                << load_status.error_message();
     LOG(ERROR) << usage;
     return -1;
@@ -219,7 +273,7 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     save_status = WriteBinaryProto(Env::Default(), out_graph, graph_def);
   }
   if (!save_status.ok()) {
-    LOG(ERROR) << "Saving graph '" << out_graph << "' failed with "
+    LOG(ERROR) << "Saving graph '" << out_graph_string << "' failed with "
                << save_status.error_message();
     return -1;
   }
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 211f93296b..f84a91d009 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -31,7 +31,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.7.0'
+_VERSION = '1.8.0-rc0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index bbef4b9e5f..8b26a32eac 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -167,11 +167,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "gemmlowp",
       urls = [
-          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/7c7c744640ddc3d0af18fb245b4d23228813a71b.zip",
-          "https://github.com/google/gemmlowp/archive/7c7c744640ddc3d0af18fb245b4d23228813a71b.zip",
+          # TODO (yongtang): uncomment once mirror.bazel.build is propagated.
+          # "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
+          "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
       ],
-      sha256 = "b852cc90259a7357c8a323f108f2cec6e85979fc3b18b5590b99e0130044b2cf",
-      strip_prefix = "gemmlowp-7c7c744640ddc3d0af18fb245b4d23228813a71b",
+      sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658",
+      strip_prefix = "gemmlowp-38ebac7b059e84692f53e5938f97a9943c120d98",
   )
 
   tf_http_archive(
diff --git a/third_party/repo.bzl b/third_party/repo.bzl
index aa178fa8ca..36f5aa5bde 100644
--- a/third_party/repo.bzl
+++ b/third_party/repo.bzl
@@ -17,6 +17,7 @@
 _SINGLE_URL_WHITELIST = depset([
     "arm_compiler",
     "ortools_archive",
+    "gemmlowp",
 ])
 
 def _is_windows(ctx):
@@ -68,7 +69,7 @@ def _apply_delete(ctx, paths):
   _execute_and_check_ret_code(ctx, cmd)
 
 def _tf_http_archive(ctx):
-  if ("mirror.bazel.build" not in ctx.attr.urls[0] or
+  if ("mirror.bazel.build" not in ctx.attr.urls[0] and
       (len(ctx.attr.urls) < 2 and
        ctx.attr.name not in _SINGLE_URL_WHITELIST)):
     fail("tf_http_archive(urls) must have redundant URLs. The " +
-- 
GitLab


From 1bb16a262900dce73e8d757d9ad29feed0c878ad Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 21:46:39 -0700
Subject: [PATCH 343/450] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 194033378

---
 tensorflow/go/op/wrappers.go | 3738 +++++++++++++++++-----------------
 1 file changed, 1869 insertions(+), 1869 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index c31ca8b67a..d038846c4f 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2243,81 +2243,170 @@ func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Ou
 	return op.Output(0)
 }
 
-// Returns the complex conjugate of a complex number.
+// Gather slices from `params` into a Tensor with shape specified by `indices`.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
+// `indices` is a K-dimensional integer tensor, best thought of as a
+// (K-1)-dimensional tensor of indices into `params`, where each element defines a
+// slice of `params`:
 //
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+//     output[i_0, ..., i_{K-2}] = params[indices[i_0, ..., i_{K-2}]]
 //
-// For example:
+// Whereas in @{tf.gather} `indices` defines slices into the first
+// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
+// first `N` dimensions of `params`, where `N = indices.shape[-1]`.
+//
+// The last dimension of `indices` can be at most the rank of
+// `params`:
+//
+//     indices.shape[-1] <= params.rank
+//
+// The last dimension of `indices` corresponds to elements
+// (if `indices.shape[-1] == params.rank`) or slices
+// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
+// of `params`.  The output tensor has shape
+//
+//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
+// Some examples below.
+//
+// Simple indexing into a matrix:
 //
+// ```python
+//     indices = [[0, 0], [1, 1]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = ['a', 'd']
 // ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+//
+// Slice indexing into a matrix:
+//
+// ```python
+//     indices = [[1], [0]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [['c', 'd'], ['a', 'b']]
 // ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+//
+// Indexing into a 3-tensor:
+//
+// ```python
+//     indices = [[1]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[['a1', 'b1'], ['c1', 'd1']]]
+//
+//
+//     indices = [[0, 1], [1, 0]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [['c0', 'd0'], ['a1', 'b1']]
+//
+//
+//     indices = [[0, 0, 1], [1, 0, 1]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = ['b0', 'b1']
+// ```
+//
+// Batched indexing into a matrix:
+//
+// ```python
+//     indices = [[[0, 0]], [[0, 1]]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [['a'], ['b']]
+// ```
+//
+// Batched slice indexing into a matrix:
+//
+// ```python
+//     indices = [[[1]], [[0]]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [[['c', 'd']], [['a', 'b']]]
+// ```
+//
+// Batched indexing into a 3-tensor:
+//
+// ```python
+//     indices = [[[1]], [[0]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[[['a1', 'b1'], ['c1', 'd1']]],
+//               [[['a0', 'b0'], ['c0', 'd0']]]]
+//
+//     indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[['c0', 'd0'], ['a1', 'b1']],
+//               [['a0', 'b0'], ['c1', 'd1']]]
+//
+//
+//     indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [['b0', 'b1'], ['d0', 'c1']]
+// ```
+//
+// Arguments:
+//	params: The tensor from which to gather values.
+//	indices: Index tensor.
+//
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
+func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Conj",
+		Type: "GatherNd",
 		Input: []tf.Input{
-			input,
+			params, indices,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
+// GatherAttr is an optional argument to Gather.
+type GatherAttr func(optionalAttr)
 
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+// GatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func GatherValidateIndices(value bool) GatherAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// Gather slices from `params` according to `indices`.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// Set use_nesterov = True if you want to use Nesterov momentum.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// That is for rows we have grad for, we update var and accum as follows:
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
+// If `indices` is a permutation and `len(indices) == params.shape[0]` then
+// this operation will permute `params` accordingly.
 //
-// Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
+// `indices` are always validated to be within range. If assigned to GPU,
+// out-of-bound indices result in safe but unspecified behavior, which may include
+// raising an error.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2326,13 +2415,14 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
+		Type: "Gather",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			params, indices,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // Clips tensor values to a specified min and max.
@@ -4548,62 +4638,6 @@ func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
-type HistogramFixedWidthAttr func(optionalAttr)
-
-// HistogramFixedWidthDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT32
-func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Return histogram of values.
-//
-// Given the tensor `values`, this operation returns a rank 1 histogram counting
-// the number of entries in `values` that fall into every bin.  The bins are
-// equal width and determined by the arguments `value_range` and `nbins`.
-//
-// ```python
-// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-// nbins = 5
-// value_range = [0.0, 5.0]
-// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
-//
-// with tf.get_default_session() as sess:
-//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-//   variables.global_variables_initializer().run()
-//   sess.run(hist) => [2, 1, 1, 0, 2]
-// ```
-//
-// Arguments:
-//	values: Numeric `Tensor`.
-//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
-// values <= value_range[0] will be mapped to hist[0],
-// values >= value_range[1] will be mapped to hist[-1].
-//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
-//
-// Returns A 1-D `Tensor` holding histogram of values.
-func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "HistogramFixedWidth",
-		Input: []tf.Input{
-			values, value_range, nbins,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Adds Tensor 'bias' to Tensor 'input' for Quantized types.
 //
 // Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
@@ -7020,38 +7054,107 @@ func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_ke
 	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// Real-valued fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
+
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
-//
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+	return func(m optionalAttr) {
+		m["little_endian"] = value
+	}
+}
+
+// Reinterpret the bytes of a string as a vector of numbers.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//	bytes: All the elements must have the same length.
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT",
+		Type: "DecodeRaw",
 		Input: []tf.Input{
-			input, fft_length,
+			bytes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Copy a tensor setting everything outside a central band in each innermost matrix
+//
+// to zero.
+//
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
+//
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+//
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
+//
+// For example:
+//
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
+//
+// Arguments:
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
+//
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixBandPart",
+		Input: []tf.Input{
+			input, num_lower, num_upper,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -8207,63 +8310,6 @@ func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// GatherAttr is an optional argument to Gather.
-type GatherAttr func(optionalAttr)
-
-// GatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func GatherValidateIndices(value bool) GatherAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Gather slices from `params` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
-//
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-//
-// If `indices` is a permutation and `len(indices) == params.shape[0]` then
-// this operation will permute `params` accordingly.
-//
-// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
-// `indices` are always validated to be within range. If assigned to GPU,
-// out-of-bound indices result in safe but unspecified behavior, which may include
-// raising an error.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
-func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Gather",
-		Input: []tf.Input{
-			params, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns the truth value of (x != y) element-wise.
 //
 // *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
@@ -8386,6 +8432,98 @@ func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ..
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
+
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+//
+// Set use_nesterov = True if you want to use Nesterov momentum.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+//
+// accum = accum * momentum + grad
+// var -= lr * accum
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, indices, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns the complex conjugate of a complex number.
+//
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
+//
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// ```
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Conj",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // ResizeBilinearAttr is an optional argument to ResizeBilinear.
 type ResizeBilinearAttr func(optionalAttr)
 
@@ -9799,167 +9937,104 @@ func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, o
 	return op.Output(0)
 }
 
-// Inverse fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
-//
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
-//
-// Arguments:
-//	start: First entry in the range.
-//	stop: Last entry in the range.
-//	num: Number of values to generate.
-//
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LinSpace",
-		Input: []tf.Input{
-			start, stop, num,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
 
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
 //
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
+		m["channels"] = value
 	}
 }
 
-// Deletes the resource specified by the handle.
-//
-// All subsequent operations using the resource will result in a NotFound
-// error status.
-//
-// Arguments:
-//	resource: handle to the resource to delete.
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
 //
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
-		Input: []tf.Input{
-			resource,
-		},
-		Attrs: attrs,
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
-
-// LRNDepthRadius sets the optional depth_radius attribute to value.
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
 //
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["fancy_upscaling"] = value
 	}
 }
 
-// LRNBias sets the optional bias attribute to value.
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
 //
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["try_recover_truncated"] = value
 	}
 }
 
-// LRNAlpha sets the optional alpha attribute to value.
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
 //
-// value: A scale factor, usually positive.
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
 // If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["alpha"] = value
+		m["acceptable_fraction"] = value
 	}
 }
 
-// LRNBeta sets the optional beta attribute to value.
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["beta"] = value
+		m["dct_method"] = value
 	}
 }
 
-// Local Response Normalization.
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
 //
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
+// Accepted values are:
 //
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
 //
 // Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9968,9 +10043,9 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "DecodeAndCropJpeg",
 		Input: []tf.Input{
-			input,
+			contents, crop_window,
 		},
 		Attrs: attrs,
 	}
@@ -9978,249 +10053,273 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 	return op.Output(0)
 }
 
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ZipDataset",
-		Input: []tf.Input{
-			tf.OutputList(input_datasets),
-		},
-		Attrs: attrs,
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
+
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
-
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed2"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate. A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability. A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "AllCandidateSampler",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// 2D real-valued fast Fourier transform.
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
 //
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
 //
-// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "SparseAdd",
 		Input: []tf.Input{
-			input, fft_length,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
+// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
+type OrderedMapPeekAttr func(optionalAttr)
 
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+// OrderedMapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+// REQUIRES: value >= 0
+func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["capacity"] = value
 	}
 }
 
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// The range of pixel values for the output image might be slightly different
-// from the range for the input image because of limited numerical precision.
-// To guarantee an output range, for example `[0.0, 1.0]`, apply
-// `tf.clip_by_value` to the output.
-//
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
+// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// REQUIRES: value >= 0
+func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+// If the underlying container does not contain this key
+// this op will block until it does.   This Op is optimized for
+// performance.
+func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "OrderedMapPeek",
 		Input: []tf.Input{
-			images, size,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapPeek", err)
+		return
+	}
+	return values
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
+// Inverse fast Fourier transform.
 //
-// The padded size of each dimension D of the output is:
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
 //
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+// Arguments:
+//	input: A complex64 tensor.
 //
-// For example:
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "IFFT",
 		Input: []tf.Input{
-			input, paddings,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Checks whether a resource handle-based variable has been initialized.
+// Generates values in an interval.
+//
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `(stop - start) / (num - 1)`,
+// so that the last one is exactly `stop`.
+//
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
 //
 // Arguments:
-//	resource: the input resource handle.
+//	start: First entry in the range.
+//	stop: Last entry in the range.
+//	num: Number of values to generate.
 //
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
+		Type: "LinSpace",
 		Input: []tf.Input{
-			resource,
+			start, stop, num,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
 
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["ignore_lookup_error"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Deletes the resource specified by the handle.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// All subsequent operations using the resource will result in a NotFound
+// error status.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	resource: handle to the resource to delete.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10229,23 +10328,437 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "DestroyResourceOp",
 		Input: []tf.Input{
-			shape, seed,
+			resource,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Makes its input available to the next iteration.
-//
-// Arguments:
-//	data: The tensor to be made available to the next iteration.
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the RMSProp algorithm.
+//
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyRMSProp",
+		Input: []tf.Input{
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns the truth value of (x > y) element-wise.
+//
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Greater",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
+
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+//
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
+//
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`. 1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`. 3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SampleDistortedBoundingBox",
+		Input: []tf.Input{
+			image_size, bounding_boxes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
+//
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNBias sets the optional bias attribute to value.
+//
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Local Response Normalization.
+//
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
+//
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
+//
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+//
+// Arguments:
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LRN",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ZipDataset",
+		Input: []tf.Input{
+			tf.OutputList(input_datasets),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+//
+// That is, for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyAdagrad",
+		Input: []tf.Input{
+			var_, accum, lr, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
+
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom values from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessRandomUniform",
+		Input: []tf.Input{
+			shape, seed,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Makes its input available to the next iteration.
+//
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
+//
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10804,47 +11317,42 @@ func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
 
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
+// Resize `images` to `size` using area interpolation.
 //
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
+// Input images can be of different types but output images are always float.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// The range of pixel values for the output image might be slightly different
+// from the range for the input image because of limited numerical precision.
+// To guarantee an output range, for example `[0.0, 1.0]`, apply
+// `tf.clip_by_value` to the output.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10853,184 +11361,113 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			images, size,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x > y) element-wise.
+// 2D real-valued fast Fourier transform.
 //
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Greater",
+		Type: "RFFT2D",
 		Input: []tf.Input{
-			x, y,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
-
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+// Pads a tensor with zeros.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+// This operation pads `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["min_object_covered"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// The padded size of each dimension D of the output is:
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+// For example:
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
-//
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+	opspec := tf.OpSpec{
+		Type: "Pad",
+		Input: []tf.Input{
+			input, paddings,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
-//
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// Checks whether a resource handle-based variable has been initialized.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
+//	resource: the input resource handle.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
+		Type: "VarIsInitializedOp",
 		Input: []tf.Input{
-			image_size, bounding_boxes,
+			resource,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
 // Converts each string in the input Tensor to its hash mod by a number of buckets.
@@ -13698,6 +14135,44 @@ func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filenam
 	return scope.AddOperation(opspec)
 }
 
+// Real-valued fast Fourier transform.
+//
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
+//
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RFFT",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // QuantizedReluAttr is an optional argument to QuantizedRelu.
 type QuantizedReluAttr func(optionalAttr)
 
@@ -15398,19 +15873,229 @@ func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
 // input tensor.
 //	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolV2",
+		Input: []tf.Input{
+			input, ksize, strides,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
+
+// SkipgramWindowSize sets the optional window_size attribute to value.
+//
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["window_size"] = value
+	}
+}
+
+// SkipgramMinCount sets the optional min_count attribute to value.
+//
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["min_count"] = value
+	}
+}
+
+// SkipgramSubsample sets the optional subsample attribute to value.
+//
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
+	}
+}
+
+// Parses a text file and creates a batch of examples.
+//
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+//
+// Arguments:
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns A vector of words in the corpus. Frequencies of words. Sorted in the non-ascending order. Number of words per epoch in the data file. The current epoch number. The total number of words processed so far. A vector of word ids. A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Skipgram",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+}
+
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
+
+// StringToNumberOutType sets the optional out_type attribute to value.
+//
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Converts each string in the input Tensor to the specified numeric type.
+//
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringToNumber",
+		Input: []tf.Input{
+			string_tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regularization. Must be a scalar.
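+//	l2_shrinkage: Coefficient of the `2 * l2_shrinkage * var` term added to grad to form grad_with_shrinkage.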
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyFtrlV2",
+		Input: []tf.Input{
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
+
+// TruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "TruncatedNormal",
 		Input: []tf.Input{
-			input, ksize, strides,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -15799,739 +16484,392 @@ func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_in
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softplus",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes exponential of x - 1 element-wise.
-//
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Expm1",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the number of records this Reader has produced.
-//
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the sum along segments of a tensor.
-//
-// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentSum",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that emits the lines of one or more text files.
-//
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
-		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
-type CudnnRNNParamsSizeAttr func(optionalAttr)
-
-// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Computes size of weights that can be used by a Cudnn RNN model.
-//
-// Return the params size that can be used by the Cudnn RNN model. Subsequent
-// weight allocation and initialization should use this size.
-//
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   The actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//   dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-// params_size: The size of the params buffer that should be allocated and
-//   initialized for this RNN model. Note that this params buffer may not be
-//   compatible across GPUs. Please use CudnnRNNParamsWeights and
-//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
-//   across different runs.
-func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"T": T, "S": S}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CudnnRNNParamsSize",
-		Input: []tf.Input{
-			num_layers, num_units, input_size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes gradients for SparseSegmentMean.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
-		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the set of files matching one or more glob patterns.
-//
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
-// Note also that the order of filenames returned can be non-deterministic.
-//
-// Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
-//
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+}
+
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "Softplus",
 		Input: []tf.Input{
-			pattern,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x >= y) element-wise.
+// Computes exponential of x - 1 element-wise.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
+		Type: "Expm1",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
-
-// Conv3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
-//
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
+// Returns the number of records this Reader has produced.
 //
-// Our Conv3D implements a form of cross-correlation.
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
 //
 // Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
+		Type: "ReaderNumRecordsProducedV2",
 		Input: []tf.Input{
-			input, filter,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
+// Computes the sum along segments of a tensor.
 //
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
+		Type: "SegmentSum",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Read an element from the TensorArray into output `value`.
+// Creates a dataset that emits the lines of one or more text files.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
-//
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
+		Type: "TextLineDataset",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			filenames, compression_type, buffer_size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
-
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
-	return func(m optionalAttr) {
-		m["mode"] = value
-	}
-}
-
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
-	return func(m optionalAttr) {
-		m["round_mode"] = value
-	}
-}
+// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
+type CudnnRNNParamsSizeAttr func(optionalAttr)
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8, out[i] -= (range(T) + 1) / 2.0
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
-//
-// Now we can quantize the elements of our tensor:
-// ```c++
-// result = round(input * s)
-// ```
-//
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
-//
-// Arguments:
-//
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
+// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes size of weights that can be used by a Cudnn RNN model.
 //
+// Return the params size that can be used by the Cudnn RNN model. Subsequent
+// weight allocation and initialization should use this size.
 //
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   The actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// params_size: The size of the params buffer that should be allocated and
+//   initialized for this RNN model. Note that this params buffer may not be
+//   compatible across GPUs. Please use CudnnRNNParamsWeights and
+//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
+//   across different runs.
+func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{"T": T, "S": S}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "CudnnRNNParamsSize",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			num_layers, num_units, input_size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x < y) element-wise.
+// Computes gradients for SparseSegmentMean.
 //
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
+//
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Less",
+		Type: "SparseSegmentMeanGrad",
 		Input: []tf.Input{
-			x, y,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
-//
-// Arguments:
+// Returns the set of files matching one or more glob patterns.
 //
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
+// Note also that the order of filenames returned can be non-deterministic.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
+		Type: "MatchingFiles",
 		Input: []tf.Input{
-			features, max_value, min_features, max_features,
+			pattern,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
+// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
+type HistogramFixedWidthAttr func(optionalAttr)
 
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+// HistogramFixedWidthDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT32
+func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["dtype"] = value
 	}
 }
 
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
+// Return histogram of values.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 2D convolution given quantized 4D input and filter tensors.
+// Given the tensor `values`, this operation returns a rank 1 histogram counting
+// the number of entries in `values` that fall into every bin.  The bins are
+// equal width and determined by the arguments `value_range` and `nbins`.
 //
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// ```python
+// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+// nbins = 5
+// value_range = [0.0, 5.0]
+// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
 //
-// Arguments:
+// with tf.get_default_session() as sess:
+//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+//   variables.global_variables_initializer().run()
+//   sess.run(hist) => [2, 1, 1, 0, 2]
+// ```
 //
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
+// Arguments:
+//	values: Numeric `Tensor`.
+//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
+// values <= value_range[0] will be mapped to hist[0],
+// values >= value_range[1] will be mapped to hist[-1].
+//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns A 1-D `Tensor` holding histogram of values.
+func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "HistogramFixedWidth",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			values, value_range, nbins,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
-type StatelessMultinomialAttr func(optionalAttr)
-
-// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
-	}
+	return op.Output(0)
 }
 
-// Draws samples from a multinomial distribution.
-//
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
-//	seed: 2 seeds (shape [2]).
+// Returns the truth value of (x >= y) element-wise.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StatelessMultinomial",
+		Type: "GreaterEqual",
 		Input: []tf.Input{
-			logits, num_samples, seed,
+			x, y,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
+
+// Conv3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
-
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+// Conv3DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["dilations"] = value
 	}
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 //
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
 //
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// Our Conv3D implements a form of cross-correlation.
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+// Arguments:
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceGather",
+		Type: "Conv3D",
 		Input: []tf.Input{
-			resource, indices,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
@@ -16539,237 +16877,346 @@ func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype t
 	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
 //
-// This enables the user to close and release the resource in the middle
-// of a step/run.
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
+//
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
 //
 // Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
+		Type: "SparseDenseCwiseAdd",
 		Input: []tf.Input{
-			handle,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
+// Read an element from the TensorArray into output `value`.
 //
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
+// Arguments:
+//	handle: The handle to a TensorArray.
 //
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "SparseAdd",
+		Type: "TensorArrayReadV3",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+			handle, index, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
-type OrderedMapPeekAttr func(optionalAttr)
-
-// OrderedMapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
+	return op.Output(0)
 }
 
-// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
 
-// OrderedMapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["mode"] = value
 	}
 }
 
-// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["round_mode"] = value
 	}
 }
 
-// Op peeks at the values at the specified key.  If the
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
+//
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
+//
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+//
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8, out[i] -= (range(T) + 1) / 2.0
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
+//
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
+//
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
+// ```
+//
+// Now we can quantize the elements of our tensor:
+// ```c++
+// result = round(input * s)
+// ```
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
+//
+// Arguments:
 //
-// underlying container does not contain this key
-// this op will block until it does.   This Op is optimized for
-// performance.
-func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapPeek",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			key, indices,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the truth value of (x < y) element-wise.
+//
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapPeek", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "Less",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
-	return values
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
 
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["out_type"] = value
 	}
 }
 
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
+// Arguments:
+//
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedReluX",
+		Input: []tf.Input{
+			features, max_value, min_features, max_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
-//
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
+
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
+		m["out_type"] = value
 	}
 }
 
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+		m["dilations"] = value
 	}
 }
 
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// Computes a 2D convolution given quantized 4D input and filter tensors.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
+//
+// Arguments:
+//
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedConv2D",
+		Input: []tf.Input{
+			input, filter, min_input, max_input, min_filter, max_filter,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
+type StatelessMultinomialAttr func(optionalAttr)
+
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["output_dtype"] = value
 	}
 }
 
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
-//
-//
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
+// Draws samples from a multinomial distribution.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16778,9 +17225,9 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
+		Type: "StatelessMultinomial",
 		Input: []tf.Input{
-			contents, crop_window,
+			logits, num_samples, seed,
 		},
 		Attrs: attrs,
 	}
@@ -16788,76 +17235,71 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 	return op.Output(0)
 }
 
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
-
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
 
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
+		Type: "ResourceGather",
 		Input: []tf.Input{
-			true_classes,
+			resource, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
+}
+
+// Delete the TensorArray from its resource container.
+//
+// This enables the user to close and release the resource in the middle
+// of a step/run.
+//
+// Arguments:
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//
+// Returns the created operation.
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayCloseV3",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
 }
 
 // Saves the input tensors to disk.
@@ -18964,242 +19406,32 @@ func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
 
 // Outputs random integers from a uniform distribution.
 //
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
-//
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
-//
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
-		Input: []tf.Input{
-			shape, minval, maxval,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
-
-// SkipgramWindowSize sets the optional window_size attribute to value.
-//
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["window_size"] = value
-	}
-}
-
-// SkipgramMinCount sets the optional min_count attribute to value.
-//
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["min_count"] = value
-	}
-}
-
-// SkipgramSubsample sets the optional subsample attribute to value.
-//
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["subsample"] = value
-	}
-}
-
-// Parses a text file and creates a batch of examples.
-//
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
-//
-// Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
-//
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
-}
-
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
-
-// StringToNumberOutType sets the optional out_type attribute to value.
-//
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Converts each string in the input Tensor to the specified numeric type.
-//
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringToNumber",
-		Input: []tf.Input{
-			string_tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
-		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
-
-// TruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
+//
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
 //
 // Arguments:
 //	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
+		Type: "RandomUniformInt",
 		Input: []tf.Input{
-			shape,
+			shape, minval, maxval,
 		},
 		Attrs: attrs,
 	}
@@ -19325,113 +19557,6 @@ func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...Or
 	return op.Output(0)
 }
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
-
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
-//
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
-	return func(m optionalAttr) {
-		m["little_endian"] = value
-	}
-}
-
-// Reinterpret the bytes of a string as a vector of numbers.
-//
-// Arguments:
-//	bytes: All the elements must have the same length.
-//
-//
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
-		Input: []tf.Input{
-			bytes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
-//
-// Useful special cases:
-//
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
-//
-// Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
-//
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
-		Input: []tf.Input{
-			input, num_lower, num_upper,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Counts the number of occurrences of each value in an integer array.
 //
 // Outputs a vector with length `size` and the same dtype as `weights`. If
@@ -21159,7 +21284,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
 //    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
 //
 // The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
+// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
 // Each element must be in the range `[0, 255]` (It represents the value of a
 // pixel in the output image).  Non-finite values in the input tensor are
 // replaced by this tensor in the output image.  The default value is the color
@@ -30569,128 +30694,3 @@ func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// Gather slices from `params` into a Tensor with shape specified by `indices`.
-//
-// `indices` is an K-dimensional integer tensor, best thought of as a
-// (K-1)-dimensional tensor of indices into `params`, where each element defines a
-// slice of `params`:
-//
-//     output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
-//
-// Whereas in @{tf.gather} `indices` defines slices into the first
-// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
-// first `N` dimensions of `params`, where `N = indices.shape[-1]`.
-//
-// The last dimension of `indices` can be at most the rank of
-// `params`:
-//
-//     indices.shape[-1] <= params.rank
-//
-// The last dimension of `indices` corresponds to elements
-// (if `indices.shape[-1] == params.rank`) or slices
-// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
-// of `params`.  The output tensor has shape
-//
-//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
-//
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, a 0 is stored in the
-// corresponding output value.
-//
-// Some examples below.
-//
-// Simple indexing into a matrix:
-//
-// ```python
-//     indices = [[0, 0], [1, 1]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = ['a', 'd']
-// ```
-//
-// Slice indexing into a matrix:
-//
-// ```python
-//     indices = [[1], [0]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [['c', 'd'], ['a', 'b']]
-// ```
-//
-// Indexing into a 3-tensor:
-//
-// ```python
-//     indices = [[1]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[['a1', 'b1'], ['c1', 'd1']]]
-//
-//
-//     indices = [[0, 1], [1, 0]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [['c0', 'd0'], ['a1', 'b1']]
-//
-//
-//     indices = [[0, 0, 1], [1, 0, 1]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = ['b0', 'b1']
-// ```
-//
-// Batched indexing into a matrix:
-//
-// ```python
-//     indices = [[[0, 0]], [[0, 1]]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [['a'], ['b']]
-// ```
-//
-// Batched slice indexing into a matrix:
-//
-// ```python
-//     indices = [[[1]], [[0]]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [[['c', 'd']], [['a', 'b']]]
-// ```
-//
-// Batched indexing into a 3-tensor:
-//
-// ```python
-//     indices = [[[1]], [[0]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[[['a1', 'b1'], ['c1', 'd1']]],
-//               [[['a0', 'b0'], ['c0', 'd0']]]]
-//
-//     indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[['c0', 'd0'], ['a1', 'b1']],
-//               [['a0', 'b0'], ['c1', 'd1']]]
-//
-//
-//     indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [['b0', 'b1'], ['d0', 'c1']]
-// ```
-//
-// Arguments:
-//	params: The tensor from which to gather values.
-//	indices: Index tensor.
-//
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
-func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GatherNd",
-		Input: []tf.Input{
-			params, indices,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-- 
GitLab


From 8fa27b1903ceedb25da5649aa17160866dda734d Mon Sep 17 00:00:00 2001
From: Billy Lamberta <blamb@google.com>
Date: Mon, 23 Apr 2018 22:08:52 -0700
Subject: [PATCH 344/450] docs: Clean up install_linux with pip

---
 tensorflow/docs_src/install/install_linux.md | 342 ++++++++-----------
 1 file changed, 151 insertions(+), 191 deletions(-)

diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index b7b0fc7d3d..9b431e49ee 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -103,248 +103,203 @@ the specified versions. If upgrading is not possible, then you may still run
 TensorFlow with GPU support, if you @{$install_sources$install TensorFlow from Sources}.
 
 
-## Determine how to install TensorFlow
-
-You must pick the mechanism by which you install TensorFlow. The
-supported choices are as follows:
-
-  * [Virtualenv](#InstallingVirtualenv)
-  * ["native" pip](#InstallingNativePip)
-  * [Docker](#InstallingDocker)
-  * [Anaconda](#InstallingAnaconda)
-  * installing from sources, which is documented in
-    [a separate guide](https://www.tensorflow.org/install/install_sources).
-
-**We recommend the Virtualenv installation.**
-[Virtualenv](https://virtualenv.pypa.io/en/stable/)
-is a virtual Python environment isolated from other Python development,
-incapable of interfering with or being affected by other Python programs
-on the same machine.  During the Virtualenv installation process,
-you will install not only TensorFlow but also all the packages that
-TensorFlow requires.  (This is actually pretty easy.)
-To start working with TensorFlow, you simply need to "activate" the
-virtual environment.  All in all, Virtualenv provides a safe and
-reliable mechanism for installing and running TensorFlow.
-
-Native pip installs TensorFlow directly on your system without going
-through any container system. **We recommend the native pip install for
-system administrators aiming to make TensorFlow available to everyone on a
-multi-user system.** Since a native pip installation is not walled-off in
-a separate container, the pip installation might interfere with other
-Python-based installations on your system. However, if you understand pip
-and your Python environment, a native pip installation often entails only
-a single command.
+## How to install TensorFlow
 
-Docker completely isolates the TensorFlow installation
-from pre-existing packages on your machine. The Docker container contains
-TensorFlow and all its dependencies. Note that the Docker image can be quite
-large (hundreds of MBs). You might choose the Docker installation if you are
-incorporating TensorFlow into a larger application architecture that already
-uses Docker.
-
-In Anaconda, you may use conda to create a virtual environment.
-However, within Anaconda, we recommend installing TensorFlow with the
-`pip install` command, not with the `conda install` command.
-
-**NOTE:** The conda package is community supported, not officially supported.
-That is, the TensorFlow team neither tests nor maintains the conda package.
-Use that package at your own risk.
+There are a few options to install TensorFlow on your machine:
 
+* [Use pip in a virtual environment](#InstallingVirtualenv) *(recommended)*
+* [Use pip in your system environment](#InstallingNativePip)
+* [Configure a Docker container](#InstallingDocker)
+* [Use pip in Anaconda](#InstallingAnaconda)
+* [Install TensorFlow from source](/install/install_sources)
 
 <a name="InstallingVirtualenv"></a>
-## Installing with Virtualenv
-
-Take the following steps to install TensorFlow with Virtualenv:
-
-  1. Install pip and Virtualenv by issuing one of the following commands:
-
-     <pre>$ <b>sudo apt-get install python-pip python-dev python-virtualenv</b> # for Python 2.7
-    $ <b>sudo apt-get install python3-pip python3-dev python-virtualenv</b> # for Python 3.n</pre>
-
-  2. Create a Virtualenv environment by issuing one of the following commands:
+### Use `pip` in a virtual environment
 
-     <pre>$ <b>virtualenv --system-site-packages</b> <i>targetDirectory</i> # for Python 2.7
-    $ <b>virtualenv --system-site-packages -p python3</b> <i>targetDirectory</i> # for Python 3.n</pre>
+This is the *recommended* install method. The
+[Virtualenv](https://virtualenv.pypa.io/en/stable/) tool creates virtual Python
+environments that are isolated from other Python development on the same machine.
+In this scenario, you install TensorFlow and its dependencies within a virtual
+environment that is available when *activated*. Virtualenv provides a reliable
+way to install and run TensorFlow while avoiding conflicts with the rest of the
+system.
 
-     where <code><em>targetDirectory</em></code> specifies the top of the
-     Virtualenv tree.  Our instructions assume that
-     <code><em>targetDirectory</em></code> is `~/tensorflow`, but you may
-     choose any directory.
+1\. On Ubuntu, install the `pip` and `virtualenv` packages:
 
-  3. Activate the Virtualenv environment by issuing one of the following
-     commands:
-
-     <pre>$ <b>source ~/tensorflow/bin/activate</b> # bash, sh, ksh, or zsh
-    $ <b>source ~/tensorflow/bin/activate.csh</b>  # csh or tcsh
-    $ <b>. ~/tensorflow/bin/activate.fish</b>  # fish</pre>
-
-     The preceding <tt>source</tt> command should change your prompt
-     to the following:
-
-     <pre>(tensorflow)$ </pre>
-
-  4. Ensure pip ≥8.1 is installed:
-
-     <pre>(tensorflow)$ <b>easy_install -U pip</b></pre>
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-get install python-pip python-dev python-virtualenv   # for Python 2.7</code>
+  <code class="devsite-terminal">sudo apt-get install python3-pip python3-dev python-virtualenv # for Python 3.n</code>
+</pre>
 
-  5. Issue one of the following commands to install TensorFlow in the active
-     Virtualenv environment:
+2\. Create a directory for the virtual environment and choose a Python
+interpreter:
 
-     <pre>(tensorflow)$ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
-    (tensorflow)$ <b>pip3 install --upgrade tensorflow</b>     # for Python 3.n
-    (tensorflow)$ <b>pip install --upgrade tensorflow-gpu</b>  # for Python 2.7 and GPU
-    (tensorflow)$ <b>pip3 install --upgrade tensorflow-gpu</b> # for Python 3.n and GPU</pre>
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">mkdir ~/tensorflow  # somewhere to work out of</code>
+  <code class="devsite-terminal">cd ~/tensorflow</code>
+  <code># Choose one of the following Python environments for the ./venv directory:</code>
+  <code class="devsite-terminal">virtualenv --system-site-packages <var>venv</var>            # Use python default (Python 2.7)</code>
+  <code class="devsite-terminal">virtualenv --system-site-packages -p python3 <var>venv</var> # Use Python 3.n</code>
+</pre>
 
-     If the above command succeeds, skip Step 6. If the preceding
-     command fails, perform Step 6.
+3\. Activate the Virtualenv environment using one of these shell commands:
 
-  6. (Optional) If Step 5 failed (typically because you invoked a pip version
-     lower than 8.1), install TensorFlow in the active Virtualenv environment
-     by issuing a command of the following format:
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">source ~/tensorflow/<var>venv</var>/bin/activate      # bash, sh, ksh, or zsh</code>
+  <code class="devsite-terminal">source ~/tensorflow/<var>venv</var>/bin/activate.csh  # csh or tcsh</code>
+  <code class="devsite-terminal">. ~/tensorflow/<var>venv</var>/bin/activate.fish      # fish</code>
+</pre>
 
-     <pre>(tensorflow)$ <b>pip install --upgrade</b> <i>tfBinaryURL</i>   # Python 2.7
-    (tensorflow)$ <b>pip3 install --upgrade</b> <i>tfBinaryURL</i>  # Python 3.n </pre>
+When the Virtualenv is activated, the shell prompt displays as `(venv) $`.
 
-     where <code><em>tfBinaryURL</em></code> identifies the URL of the
-     TensorFlow Python package. The appropriate value of
-     <code><em>tfBinaryURL</em></code>depends on the operating system,
-     Python version, and GPU support. Find the appropriate value for
-     <code><em>tfBinaryURL</em></code> for your system
-     [here](#the_url_of_the_tensorflow_python_package).  For example, if you
-     are installing TensorFlow for Linux, Python 3.4, and CPU-only support,
-     issue the following command to install TensorFlow in the active
-     Virtualenv environment:
+4\. Upgrade `pip` in your virtual environment:
 
-     <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+See the [pip installation guide](https://pip.pypa.io/en/stable/installing/) for
+instructions, or use `easy_install`:
 
-If you encounter installation problems, see
-[Common Installation Problems](#common_installation_problems).
+<pre class="prettyprint lang-bsh">
+(venv)$ easy_install -U pip
+</pre>
 
+5\. Within an active Virtualenv environment, use one of the following `pip`
+commands to install the TensorFlow package:
 
-### Next Steps
+<pre class="prettyprint lang-bsh">
+(venv)$ pip install --upgrade tensorflow      # for Python 2.7
+(venv)$ pip3 install --upgrade tensorflow     # for Python 3.n
+(venv)$ pip install --upgrade tensorflow-gpu  # for Python 2.7 and GPU
+</pre>
 
-After installing TensorFlow,
-[validate the installation](#ValidateYourInstallation).
+Success! TensorFlow is now installed.
 
-Note that you must activate the Virtualenv environment each time you
-use TensorFlow. If the Virtualenv environment is not currently active,
-invoke one of the following commands:
+Use `pip list` to show the packages installed in the virtual environment.
+[Validate the install](#ValidateYourInstallation) and test the version:
 
-<pre>$ <b>source ~/tensorflow/bin/activate</b>      # bash, sh, ksh, or zsh
-$ <b>source ~/tensorflow/bin/activate.csh</b>  # csh or tcsh</pre>
+<pre class="prettyprint lang-bsh">
+(venv)$ python -c "import tensorflow as tf; print(tf.__version__)"
+</pre>
 
-When the Virtualenv environment is active, you may run
-TensorFlow programs from this shell.  Your prompt will become
-the following to indicate that your tensorflow environment is active:
+Use the `deactivate` command to stop the Python virtual environment.
 
-<pre>(tensorflow)$ </pre>
+#### Problems
 
-When you are done using TensorFlow, you may deactivate the
-environment by invoking the `deactivate` function as follows:
+If the above steps failed, try installing the TensorFlow binary using the remote
+URL of the `pip` package:
 
-<pre>(tensorflow)$ <b>deactivate</b> </pre>
+<pre class="prettyprint lang-bsh">
+(venv)$ pip install --upgrade <var>remote-pkg-URL</var>   # Python 2.7
+(venv)$ pip3 install --upgrade <var>remote-pkg-URL</var>  # Python 3.n
+</pre>
 
-The prompt will revert back to your default prompt (as defined by the
-`PS1` environment variable).
+The <var>remote-pkg-URL</var> depends on the operating system, Python version,
+and GPU support. See [here](#the_url_of_the_tensorflow_python_package) for the
+URL naming scheme and location.
 
+See [Common Installation Problems](#common_installation_problems) if you
+encounter problems.
 
-### Uninstalling TensorFlow
+#### Uninstall TensorFlow
 
-To uninstall TensorFlow, simply remove the tree you created.
-For example:
+To uninstall TensorFlow, remove the Virtualenv directory you created in step 2:
 
-<pre>$ <b>rm -r</b> <i>targetDirectory</i> </pre>
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">deactivate  # stop the virtualenv</code>
+  <code class="devsite-terminal">rm -r ~/tensorflow/<var>venv</var></code>
+</pre>
 
 
 <a name="InstallingNativePip"></a>
-## Installing with native pip
-
-You may install TensorFlow through pip, choosing between a simple
-installation procedure or a more complex one.
+### Use `pip` in your system environment
 
-**Note:** The
-[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py)
-lists the TensorFlow packages that pip will install or upgrade.
+Use `pip` to install the TensorFlow package directly on your system without
+using a container or virtual environment for isolation. This method is
+recommended for system administrators that want a TensorFlow installation that is
+available to everyone on a multi-user system.
 
+Since a system install is not isolated, it could interfere with other
+Python-based installations. But if you understand `pip` and your Python
+environment, a system `pip` install is straightforward.
 
-### Prerequisite: Python and Pip
+See the
+[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py)
+for a list of TensorFlow packages that `pip` installs or upgrades.
 
-Python is automatically installed on Ubuntu.  Take a moment to confirm
-(by issuing a `python -V` command) that one of the following Python
-versions is already installed on your system:
 
-  * Python 2.7
-  * Python 3.4+
+#### Install Python and `pip`
 
-The pip or pip3 package manager is *usually* installed on Ubuntu.  Take a
-moment to confirm (by issuing a `pip -V` or `pip3 -V` command)
-that pip or pip3 is installed.  We strongly recommend version 8.1 or higher
-of pip or pip3.  If Version 8.1 or later is not installed, issue the
-following command, which will either install or upgrade to the latest
-pip version:
+On Ubuntu, Python is automatically installed and `pip` is *usually* installed.
+Confirm the `python` and `pip` versions:
 
-<pre>$ <b>sudo apt-get install python-pip python-dev</b>   # for Python 2.7
-$ <b>sudo apt-get install python3-pip python3-dev</b> # for Python 3.n
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">python -V</code>
+  <code class="devsite-terminal">pip -V  # or: pip3 -V</code>
 </pre>
 
+We *strongly recommend* `pip` or `pip3` version 8.1 or higher. If using a release
+before version 8.1, upgrade `pip`:
 
-### Install TensorFlow
-
-Assuming the prerequisite software is installed on your Linux host,
-take the following steps:
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-get install python-pip python-dev   # for Python 2.7</code>
+  <code class="devsite-terminal">sudo apt-get install python3-pip python3-dev # for Python 3.n</code>
+</pre>
 
-  1. Install TensorFlow by invoking **one** of the following commands:
 
-     <pre>$ <b>pip install tensorflow</b>      # Python 2.7; CPU support (no GPU support)
-    $ <b>pip3 install tensorflow</b>     # Python 3.n; CPU support (no GPU support)
-    $ <b>pip install tensorflow-gpu</b>  # Python 2.7;  GPU support
-    $ <b>pip3 install tensorflow-gpu</b> # Python 3.n; GPU support </pre>
+#### Install TensorFlow
 
-     If the preceding command runs to completion, you should now
-     [validate your installation](#ValidateYourInstallation).
+Install one of the available TensorFlow packages:
 
-  2. (Optional.) If Step 1 failed, install the latest version of TensorFlow
-     by issuing a command of the following format:
+<pre class="prettyprint lang-bsh">
+  <code># Select one:</code>
+  <code class="devsite-terminal">sudo pip install tensorflow      # Python 2.7 CPU (no GPU support)</code>
+  <code class="devsite-terminal">sudo pip3 install tensorflow     # Python 3.n CPU (no GPU support)</code>
+  <code class="devsite-terminal">sudo pip install tensorflow-gpu  # Python 2.7 GPU support</code>
+  <code class="devsite-terminal">sudo pip3 install tensorflow-gpu # Python 3.n GPU support</code>
+</pre>
 
-     <pre>$ <b>sudo pip  install --upgrade</b> <i>tfBinaryURL</i>   # Python 2.7
-    $ <b>sudo pip3 install --upgrade</b> <i>tfBinaryURL</i>   # Python 3.n </pre>
+Success! TensorFlow is now installed.
 
-     where <code><em>tfBinaryURL</em></code> identifies the URL of the
-     TensorFlow Python package. The appropriate value of
-     <code><em>tfBinaryURL</em></code> depends on the operating system,
-     Python version, and GPU support. Find the appropriate value for
-     <code><em>tfBinaryURL</em></code>
-     [here](#the_url_of_the_tensorflow_python_package).  For example, to
-     install TensorFlow for Linux, Python 3.4, and CPU-only support, issue
-     the following command:
+Use `pip list` to show the packages installed on the system.
+[Validate the install](#ValidateYourInstallation) and test the version:
 
-     <pre>
-     $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b>
-     </pre>
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">python -c "import tensorflow as tf; print(tf.__version__)"</code>
+</pre>
 
-     If this step fails, see
-     [Common Installation Problems](#common_installation_problems).
+#### Problems
 
+If the above steps failed, try installing the TensorFlow binary using the remote
+URL of the `pip` package:
 
-### Next Steps
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo pip install --upgrade <var>remote-pkg-URL</var>   # Python 2.7</code>
+  <code class="devsite-terminal">sudo pip3 install --upgrade <var>remote-pkg-URL</var>  # Python 3.n</code>
+</pre>
 
-After installing TensorFlow, [validate your installation](#ValidateYourInstallation).
+The <var>remote-pkg-URL</var> depends on the operating system, Python version,
+and GPU support. See [here](#the_url_of_the_tensorflow_python_package) for the
+URL naming scheme and location.
 
+See [Common Installation Problems](#common_installation_problems) if you
+encounter problems.
 
-### Uninstalling TensorFlow
+#### Uninstall TensorFlow
 
-To uninstall TensorFlow, issue one of following commands:
+To uninstall TensorFlow on your system, use one of the following commands:
 
-<pre>
-$ <b>sudo pip uninstall tensorflow</b>  # for Python 2.7
-$ <b>sudo pip3 uninstall tensorflow</b> # for Python 3.n
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo pip uninstall tensorflow   # for Python 2.7</code>
+  <code class="devsite-terminal">sudo pip3 uninstall tensorflow  # for Python 3.n</code>
 </pre>
 
-
 <a name="InstallingDocker"></a>
-## Installing with Docker
+### Configure a Docker container
+
+Docker completely isolates the TensorFlow installation
+from pre-existing packages on your machine. The Docker container contains
+TensorFlow and all its dependencies. Note that the Docker image can be quite
+large (hundreds of MBs). You might choose the Docker installation if you are
+incorporating TensorFlow into a larger application architecture that already
+uses Docker.
 
 Take the following steps to install TensorFlow through Docker:
 
@@ -364,7 +319,7 @@ Take the following steps to install TensorFlow through Docker:
 The remainder of this section explains how to launch a Docker container.
 
 
-### CPU-only
+#### CPU-only
 
 To launch a Docker container with CPU-only support (that is, without
 GPU support), enter a command of the following format:
@@ -414,7 +369,7 @@ $ <b>docker run -it -p 8888:8888 tensorflow/tensorflow</b>
 Docker will download the TensorFlow binary image the first time you launch it.
 
 
-### GPU support
+#### GPU support
 
 Prior to installing TensorFlow with GPU support, ensure that your system meets all
 [NVIDIA software requirements](#NVIDIARequirements).  To launch a Docker container
@@ -470,14 +425,22 @@ For more details see the
 [TensorFlow docker readme](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/docker).
 
 
-### Next Steps
+#### Next Steps
 
 You should now
 [validate your installation](#ValidateYourInstallation).
 
 
 <a name="InstallingAnaconda"></a>
-## Installing with Anaconda
+### Use `pip` in Anaconda
+
+Anaconda provides the `conda` utility to create a virtual environment. However,
+within Anaconda, we recommend installing TensorFlow using the `pip install`
+command and *not* with the `conda install` command.
+
+Caution: `conda` is a community-supported package that is not officially
+maintained by the TensorFlow team. Use this package at your own risk since it is
+not tested on new TensorFlow releases.
 
 Take the following steps to install TensorFlow in an Anaconda environment:
 
@@ -563,10 +526,7 @@ installation problems](#common_installation_problems).
 If you are new to machine learning, we recommend the following:
 
 *  [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course)
-*  @{$get_started/get_started_for_beginners$Getting Started for ML Beginners}
-
-If you are experienced with machine learning but new to TensorFlow, see
-@{$get_started/premade_estimators$Getting Started with TensorFlow}.
+*  @{$get_started/eager}
 
 
 ## Common installation problems
@@ -581,7 +541,7 @@ ask a new question about it on Stack Overflow and specify
 the `tensorflow` tag.
 
 <table>
-<tr> <th>Stack Overflow Link</th> <th>Error Message</th> </tr>
+<tr> <th>Link to GitHub or Stack&nbsp;Overflow</th> <th>Error Message</th> </tr>
 
 <tr>
   <td><a href="https://stackoverflow.com/q/36159194">36159194</a></td>
-- 
GitLab


From 9c5c558cba9069dfedfde9431ed13227b3893bbf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 22:36:35 -0700
Subject: [PATCH 345/450] Make ClientLibraryTestBase::CreateScalarRelu return
 XlaComputation.

PiperOrigin-RevId: 194036707
---
 tensorflow/compiler/xla/tests/client_library_test_base.cc | 4 ++--
 tensorflow/compiler/xla/tests/client_library_test_base.h  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 31c9e21644..c09a6d71c9 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -621,8 +621,8 @@ ClientLibraryTestBase::ComputeValueAndReference(
   return std::make_pair(std::move(reference), std::move(result));
 }
 
-Computation ClientLibraryTestBase::CreateScalarRelu() {
-  ComputationBuilder builder(client_, "relu");
+XlaComputation ClientLibraryTestBase::CreateScalarRelu() {
+  XlaBuilder builder("relu");
   auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
   auto z_value = builder.Parameter(0, shape, "z_value");
   auto zero = use_bfloat16_
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 85ebe29ae9..c303a4562e 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -255,7 +255,7 @@ class ClientLibraryTestBase : public ::testing::Test {
                          ErrorSpec error);
 
   // Create scalar operations for use in reductions.
-  Computation CreateScalarRelu();
+  XlaComputation CreateScalarRelu();
   Computation CreateScalarMax();
   Computation CreateScalarReluSensitivity();
 
-- 
GitLab


From d75f2bf9041c7d50c932e48a175c9d5ab0bd0075 Mon Sep 17 00:00:00 2001
From: Igor Ganichev <iga@google.com>
Date: Mon, 23 Apr 2018 22:36:39 -0700
Subject: [PATCH 346/450] Internal change

PiperOrigin-RevId: 194036710
---
 .../eager/python/examples/resnet50/BUILD      | 11 ++++++
 .../python/examples/resnet50/resnet50_test.py | 34 ++++++++++---------
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/tensorflow/contrib/eager/python/examples/resnet50/BUILD b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
index 536cad998d..0c0e28dd95 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/BUILD
+++ b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
@@ -14,6 +14,17 @@ py_library(
     ],
 )
 
+py_library(
+    name = "resnet50_test_lib",
+    srcs = ["resnet50_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":resnet50",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/eager/python:tfe",
+    ],
+)
+
 cuda_py_test(
     name = "resnet50_test",
     size = "large",
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index d6923293a3..09a0cd88d8 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -36,8 +36,8 @@ def device_and_data_format():
                                                               'channels_last')
 
 
-def random_batch(batch_size):
-  _, data_format = device_and_data_format()
+def random_batch(batch_size, device_and_format=None):
+  _, data_format = device_and_format or device_and_data_format()
 
   shape = (3, 224, 224) if data_format == 'channels_first' else (224, 224, 3)
   shape = (batch_size,) + shape
@@ -184,22 +184,23 @@ class ResNet50Benchmarks(tf.test.Benchmark):
 
   def _report(self, label, start, num_iters, device, batch_size, data_format):
     avg_time = (time.time() - start) / num_iters
-    dev = 'cpu' if 'cpu' in device else 'gpu'
+    dev = tf.DeviceSpec.from_string(device).device_type.lower()
     name = '%s_%s_batch_%d_%s' % (label, dev, batch_size, data_format)
     extras = {'examples_per_sec': batch_size / avg_time}
     self.report_benchmark(
         iters=num_iters, wall_time=avg_time, name=name, extras=extras)
 
-  def _force_gpu_sync(self):
-    # If this function is called in the context of a GPU device
+  def _force_device_sync(self):
+    # If this function is called in the context of a non-CPU device
     # (e.g., inside a 'with tf.device("/gpu:0")' block)
-    # then this will force a copy from CPU->GPU->CPU, which forces
-    # a sync. This is a roundabout way, yes.
+    # then this will force a copy from CPU->NON_CPU_DEVICE->CPU,
+    # which forces a sync. This is a roundabout way, yes.
     tf.constant(1.).cpu()
 
-  def _benchmark_eager_apply(self, label, defun=False, execution_mode=None):
+  def _benchmark_eager_apply(self, label, defun=False, execution_mode=None,
+                             device_and_format=None):
     with tfe.execution_mode(execution_mode):
-      device, data_format = device_and_data_format()
+      device, data_format = device_and_format or device_and_data_format()
       model = resnet50.ResNet50(data_format)
       if defun:
         model.call = tfe.defun(model.call)
@@ -207,7 +208,7 @@ class ResNet50Benchmarks(tf.test.Benchmark):
       num_burn = 5
       num_iters = 30
       with tf.device(device):
-        images, _ = random_batch(batch_size)
+        images, _ = random_batch(batch_size, device_and_format)
         for _ in xrange(num_burn):
           model(images, training=False).cpu()
         if execution_mode:
@@ -220,7 +221,7 @@ class ResNet50Benchmarks(tf.test.Benchmark):
           tfe.async_wait()
         self._report(label, start, num_iters, device, batch_size, data_format)
 
-  def benchmark_eager_apply(self):
+  def benchmark_eager_apply_sync(self):
     self._benchmark_eager_apply('eager_apply', defun=False)
 
   def benchmark_eager_apply_async(self):
@@ -234,11 +235,12 @@ class ResNet50Benchmarks(tf.test.Benchmark):
                              label,
                              make_iterator,
                              defun=False,
-                             execution_mode=None):
+                             execution_mode=None,
+                             device_and_format=None):
     with tfe.execution_mode(execution_mode):
-      device, data_format = device_and_data_format()
+      device, data_format = device_and_format or device_and_data_format()
       for batch_size in self._train_batch_sizes():
-        (images, labels) = random_batch(batch_size)
+        (images, labels) = random_batch(batch_size, device_and_format)
         num_burn = 3
         num_iters = 10
         model = resnet50.ResNet50(data_format)
@@ -253,7 +255,7 @@ class ResNet50Benchmarks(tf.test.Benchmark):
             train_one_step(model, images, labels, optimizer)
           if execution_mode:
             tfe.async_wait()
-          self._force_gpu_sync()
+          self._force_device_sync()
           gc.collect()
 
           start = time.time()
@@ -262,7 +264,7 @@ class ResNet50Benchmarks(tf.test.Benchmark):
             train_one_step(model, images, labels, optimizer)
           if execution_mode:
             tfe.async_wait()
-          self._force_gpu_sync()
+          self._force_device_sync()
           self._report(label, start, num_iters, device, batch_size, data_format)
 
   def benchmark_eager_train(self):
-- 
GitLab


From 969be44f38d566b46b2d8a15958fd10db2b108fb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 23:18:11 -0700
Subject: [PATCH 347/450] Update ops-related pbtxt files.

PiperOrigin-RevId: 194039856
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 194 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 194 ++++++++++++++++++
 2 files changed, 388 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 247f9edf5b..05dee30ca0 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -1534,6 +1534,85 @@ op {
     }
   }
 }
+op {
+  name: "ApplyAdaMax"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ApplyAdadelta"
   input_arg {
@@ -11234,6 +11313,38 @@ op {
     }
   }
 }
+op {
+  name: "BroadcastTo"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Bucketize"
   input_arg {
@@ -42885,6 +42996,78 @@ op {
     }
   }
 }
+op {
+  name: "ResourceApplyAdaMax"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyAdadelta"
   input_arg {
@@ -66434,6 +66617,17 @@ op {
     }
   }
 }
+op {
+  name: "StringStrip"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+}
 op {
   name: "StringToHashBucket"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index d1773daebe..2edd15c446 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -684,6 +684,85 @@ op {
     }
   }
 }
+op {
+  name: "ApplyAdaMax"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ApplyAdadelta"
   input_arg {
@@ -4388,6 +4467,38 @@ op {
     }
   }
 }
+op {
+  name: "BroadcastTo"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Bucketize"
   input_arg {
@@ -21487,6 +21598,78 @@ op {
     }
   }
 }
+op {
+  name: "ResourceApplyAdaMax"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyAdadelta"
   input_arg {
@@ -30483,6 +30666,17 @@ op {
     }
   }
 }
+op {
+  name: "StringStrip"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+}
 op {
   name: "StringToHashBucket"
   input_arg {
-- 
GitLab


From aab0ef354b628ff4d88ab7f90b2d5bdcc440b6de Mon Sep 17 00:00:00 2001
From: Igor Ganichev <iga@google.com>
Date: Tue, 24 Apr 2018 00:15:19 -0700
Subject: [PATCH 348/450] Internal Change

PiperOrigin-RevId: 194043623
---
 .../eager/python/examples/resnet50/resnet50_test.py      | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index 09a0cd88d8..8517a3bf7b 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -169,7 +169,7 @@ class ResNet50Benchmarks(tf.test.Benchmark):
   def _train_batch_sizes(self):
     """Choose batch sizes based on GPU capability."""
     for device in device_lib.list_local_devices():
-      if 'GPU:0' in device.name:
+      if tf.DeviceSpec.from_string(device.name).device_type == 'GPU':
         # Avoid OOM errors with larger batch sizes, which seem to cause errors
         # later on even if caught.
         #
@@ -180,6 +180,11 @@ class ResNet50Benchmarks(tf.test.Benchmark):
           return (16,)
         if 'P100' in device.physical_device_desc:
           return (16, 32, 64)
+
+      if tf.DeviceSpec.from_string(device.name).device_type == 'TPU':
+        # TODO(iga): Training fails with batch size of 16, probably because of
+        # no layout optimizations with op-by-op mode. Investigate more.
+        return (8,)
     return (16, 32)
 
   def _report(self, label, start, num_iters, device, batch_size, data_format):
@@ -267,7 +272,7 @@ class ResNet50Benchmarks(tf.test.Benchmark):
           self._force_device_sync()
           self._report(label, start, num_iters, device, batch_size, data_format)
 
-  def benchmark_eager_train(self):
+  def benchmark_eager_train_sync(self):
     self._benchmark_eager_train('eager_train', MockIterator, defun=False)
 
   def benchmark_eager_train_async(self):
-- 
GitLab


From 8f20757e9bff4e2f2cdaf1a2e655eb7e0c17b68c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 02:00:06 -0700
Subject: [PATCH 349/450] Moving the Var class to framework so that it can be
 part of framework_headers_lib and accessible from contrib.

PiperOrigin-RevId: 194054227
---
 tensorflow/core/framework/resource_var.h | 58 ++++++++++++++++++++++++
 tensorflow/core/kernels/variable_ops.h   | 34 +-------------
 2 files changed, 59 insertions(+), 33 deletions(-)
 create mode 100644 tensorflow/core/framework/resource_var.h

diff --git a/tensorflow/core/framework/resource_var.h b/tensorflow/core/framework/resource_var.h
new file mode 100644
index 0000000000..872b8f8b30
--- /dev/null
+++ b/tensorflow/core/framework/resource_var.h
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_
+#define TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+
+namespace tensorflow {
+
+// Resource stored by variables in the resource manager
+// (new, resource-style version).
+class Var : public ResourceBase {
+ public:
+  explicit Var(DataType dtype) : tensor_(dtype) {}
+  // Not copyable or movable.
+  Var(const Var&) = delete;
+  Var& operator=(const Var&) = delete;
+
+  // TODO(ebrevdo): Use LockSet instead of exposing mu.
+  mutex* mu() { return &mu_; }
+  Tensor* tensor() { return &tensor_; }
+
+  string DebugString() override {
+    return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
+                           tensor_.shape().DebugString());
+  }
+
+  // Only used in the resource variable path. In resource variables,
+  // tensor.IsInitialized() can be true (i.e. have memory allocated to it) while
+  // there is not a good value there due to a race condition, and it's possible
+  // to stumble upon this during variable.initialized_value(). So it's best to
+  // just store directly whether the variable is initialized.
+  bool is_initialized = false;  // GUARDED_BY(mu_) but annotalysis doesn't like
+                                // it.
+
+ private:
+  mutex mu_;
+  Tensor tensor_;
+
+  ~Var() override {}
+};
+
+}  //  end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
index 8b406e5311..f27dab4ddd 100644
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/resource_var.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -27,39 +28,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Resource stored by variables in the resource manager
-// (new, resource-style version).
-class Var : public ResourceBase {
- public:
-  explicit Var(DataType dtype) : tensor_(dtype) {}
-  // Not copyable or movable.
-  Var(const Var&) = delete;
-  Var& operator=(const Var&) = delete;
-
-  // TODO(ebrevdo): Use LockSet instead of exposing mu.
-  mutex* mu() { return &mu_; }
-  Tensor* tensor() { return &tensor_; }
-
-  string DebugString() override {
-    return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
-                           tensor_.shape().DebugString());
-  }
-
-  // Only used in the resource variable path. In resource variables,
-  // tensor.IsInitialized() can be true (i.e. have memory allocated to it) while
-  // there is not a good value there due to a race condition, and it's possible
-  // to stumble upon this during variable.initialized_value(). So it's best to
-  // just store directly whether the variable is initialized.
-  bool is_initialized = false;  // GUARDED_BY(mu_) but annotalysis doesn't like
-                                // it.
-
- private:
-  mutex mu_;
-  Tensor tensor_;
-
-  ~Var() override {}
-};
-
 class VariableOp : public OpKernel {
  public:
   explicit VariableOp(OpKernelConstruction* context);
-- 
GitLab


From e74b98ba6348d869fee50b95b7795885fdedecee Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Tue, 24 Apr 2018 04:33:16 -0700
Subject: [PATCH 350/450] Automated g4 rollback of changelist 193718607

PiperOrigin-RevId: 194068437
---
 .../core/distributed_runtime/master_session.cc     | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index e3022f38a2..83afc5b1a4 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -89,6 +89,10 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   ~ReffedClientGraph() override {
     if (should_deregister_) {
       DeregisterPartitions();
+    } else {
+      for (Part& part : partitions_) {
+        worker_cache_->ReleaseWorker(part.name, part.worker);
+      }
     }
   }
 
@@ -1174,14 +1178,8 @@ Status MasterSession::Create(GraphDef* graph_def,
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForBaseGraph(
         graph_def, execution_options, &execution_state_));
   }
-  // TODO(b/36574172): Remove these conditions when ClusterSpec
-  // propagation is supported in all servers.
-  if (options.cluster_def != nullptr ||
-      session_opts_.config.isolate_session_state()) {
-    should_delete_worker_sessions_ = true;
-    return CreateWorkerSessions(options);
-  }
-  return Status::OK();
+  should_delete_worker_sessions_ = true;
+  return CreateWorkerSessions(options);
 }
 
 Status MasterSession::CreateWorkerSessions(
-- 
GitLab


From 9f38ab74161a0e8dd0b35b47f23ddeda7b286af3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 04:35:39 -0700
Subject: [PATCH 351/450] Add variants of DoBlasGemmWithAlgorithm with alpha
 being on device.

This is in preparation of allowing XLA to fuse (A dot b) * alpha where alpha
can be on device instead of just a constant.

PiperOrigin-RevId: 194068597
---
 tensorflow/stream_executor/blas.h             |  81 ++++++++-----
 tensorflow/stream_executor/cuda/cuda_blas.cc  |  81 ++++++++-----
 tensorflow/stream_executor/cuda/cuda_blas.h   |  14 +--
 .../stream_executor/host_or_device_scalar.h   |  56 +++++++++
 tensorflow/stream_executor/stream.cc          | 114 +++++++++++-------
 tensorflow/stream_executor/stream.h           |  62 +++++-----
 6 files changed, 263 insertions(+), 145 deletions(-)
 create mode 100644 tensorflow/stream_executor/host_or_device_scalar.h

diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h
index 6e62b85728..be0b0bf5fb 100644
--- a/tensorflow/stream_executor/blas.h
+++ b/tensorflow/stream_executor/blas.h
@@ -41,9 +41,10 @@ limitations under the License.
 #define TENSORFLOW_STREAM_EXECUTOR_BLAS_H_
 
 #include <complex>
-#include "tensorflow/stream_executor/platform/port.h"
 
+#include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
+#include "tensorflow/stream_executor/platform/port.h"
 
 namespace Eigen {
 struct half;
@@ -1032,43 +1033,49 @@ class BlasSupport {
   // creating a new Stream for each attempt.
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a, int lda,
-      const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int32> *c,
+      uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,
+      const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b,
+      int ldb, const HostOrDeviceScalar<int> &beta, DeviceMemory<int32> *c,
       int ldc, ComputationType computation_type, AlgorithmType algorithm,
       ProfileResult *output_profile_result) = 0;
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, const Eigen::half &alpha,
+      uint64 n, uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
       const DeviceMemory<Eigen::half> &a, int lda,
-      const DeviceMemory<Eigen::half> &b, int ldb, const Eigen::half &beta,
-      DeviceMemory<Eigen::half> *c, int ldc, ComputationType computation_type,
-      AlgorithmType algorithm, ProfileResult *output_profile_result) = 0;
+      const DeviceMemory<Eigen::half> &b, int ldb,
+      const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
+      int ldc, ComputationType computation_type, AlgorithmType algorithm,
+      ProfileResult *output_profile_result) = 0;
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
-      const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+      uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,
+      const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
+      int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
       int ldc, ComputationType computation_type, AlgorithmType algorithm,
       ProfileResult *output_profile_result) = 0;
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
-      const DeviceMemory<double> &b, int ldb, double beta,
-      DeviceMemory<double> *c, int ldc, ComputationType computation_type,
-      AlgorithmType algorithm, ProfileResult *output_profile_result) = 0;
+      uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,
+      const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
+      int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
+      int ldc, ComputationType computation_type, AlgorithmType algorithm,
+      ProfileResult *output_profile_result) = 0;
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, std::complex<float> alpha,
+      uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
       const DeviceMemory<std::complex<float>> &a, int lda,
       const DeviceMemory<std::complex<float>> &b, int ldb,
-      std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+      const HostOrDeviceScalar<std::complex<float>> &beta,
+      DeviceMemory<std::complex<float>> *c, int ldc,
       ComputationType computation_type, AlgorithmType algorithm,
       ProfileResult *output_profile_result) = 0;
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, std::complex<double> alpha,
+      uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
       const DeviceMemory<std::complex<double>> &a, int lda,
       const DeviceMemory<std::complex<double>> &b, int ldb,
-      std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+      const HostOrDeviceScalar<std::complex<double>> &beta,
+      DeviceMemory<std::complex<double>> *c, int ldc,
       ComputationType computation_type, AlgorithmType algorithm,
       ProfileResult *output_profile_result) = 0;
 
@@ -1886,49 +1893,57 @@ class BlasSupport {
       override;                                                                \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a,    \
-      int lda, const DeviceMemory<int8> &b, int ldb, int beta,                 \
-      DeviceMemory<int> *c, int ldc, blas::ComputationType computation_type,   \
+      uint64 m, uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,      \
+      const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b,       \
+      int ldb, const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c,      \
+      int ldc, blas::ComputationType computation_type,                         \
       blas::AlgorithmType algorithm,                                           \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, const Eigen::half &alpha,                  \
+      uint64 m, uint64 n, uint64 k,                                            \
+      const HostOrDeviceScalar<Eigen::half> &alpha,                            \
       const DeviceMemory<Eigen::half> &a, int lda,                             \
-      const DeviceMemory<Eigen::half> &b, int ldb, const Eigen::half &beta,    \
+      const DeviceMemory<Eigen::half> &b, int ldb,                             \
+      const HostOrDeviceScalar<Eigen::half> &beta,                             \
       DeviceMemory<Eigen::half> *c, int ldc,                                   \
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, \
-      int lda, const DeviceMemory<float> &b, int ldb, float beta,              \
-      DeviceMemory<float> *c, int ldc, blas::ComputationType computation_type, \
+      uint64 m, uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,    \
+      const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,     \
+      int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,  \
+      int ldc, blas::ComputationType computation_type,                         \
       blas::AlgorithmType algorithm,                                           \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, double alpha,                              \
+      uint64 m, uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,   \
       const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,   \
-      int ldb, double beta, DeviceMemory<double> *c, int ldc,                  \
+      int ldb, const HostOrDeviceScalar<double> &beta,                         \
+      DeviceMemory<double> *c, int ldc,                                        \
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, std::complex<float> alpha,                 \
+      uint64 m, uint64 n, uint64 k,                                            \
+      const HostOrDeviceScalar<std::complex<float>> &alpha,                    \
       const DeviceMemory<std::complex<float>> &a, int lda,                     \
       const DeviceMemory<std::complex<float>> &b, int ldb,                     \
-      std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, \
+      const HostOrDeviceScalar<std::complex<float>> &beta,                     \
+      DeviceMemory<std::complex<float>> *c, int ldc,                           \
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
-      uint64 m, uint64 n, uint64 k, std::complex<double> alpha,                \
+      uint64 m, uint64 n, uint64 k,                                            \
+      const HostOrDeviceScalar<std::complex<double>> &alpha,                   \
       const DeviceMemory<std::complex<double>> &a, int lda,                    \
       const DeviceMemory<std::complex<double>> &b, int ldb,                    \
-      std::complex<double> beta, DeviceMemory<std::complex<double>> *c,        \
-      int ldc, blas::ComputationType computation_type,                         \
-      blas::AlgorithmType algorithm,                                           \
+      const HostOrDeviceScalar<std::complex<double>> &beta,                    \
+      DeviceMemory<std::complex<double>> *c, int ldc,                          \
+      blas::ComputationType computation_type, blas::AlgorithmType algorithm,   \
       blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmBatched(                                                      \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 007c0f1c86..3c1353aee3 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -2156,10 +2156,11 @@ static bool TensorOpsAvailable(int cc_major) {
 template <typename InT, typename OutT, typename CompT>
 bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a, int lda,
-    const DeviceMemory<InT> &b, int ldb, const CompT &beta,
-    DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
-    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+    uint64 n, uint64 k, const HostOrDeviceScalar<CompT> &alpha,
+    const DeviceMemory<InT> &a, int lda, const DeviceMemory<InT> &b, int ldb,
+    const HostOrDeviceScalar<CompT> &beta, DeviceMemory<OutT> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
 // CUDA < version 8 and GPUs < sm_50 don't support cublasGemmEx.
 #if CUDA_VERSION < 8000
   return false;
@@ -2175,6 +2176,12 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     return false;
   }
 
+  // Either both 'alpha' and 'beta' need to be pointers to device memory, or
+  // they need to be both host scalars.
+  if (alpha.is_pointer() != beta.is_pointer()) {
+    return false;
+  }
+
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
@@ -2187,10 +2194,15 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
   // Since we are converting 'algorithm' to cublasGemmAlgo_t by static_cast,
   // we do the following compile-time check on the default value:
   static_assert(blas::kDefaultGemmAlgo == CUBLAS_GEMM_DFALT, "");
+  // If 'alpha' and 'beta' are host scalars and CompT is Eigen::half, we
+  // essentially reinterpret_cast to __half, which is safe because Eigen::half
+  // inherits from __half.
   bool result = DoBlasInternalFailureOK(
-      wrap::cublasGemmEx, stream, /* pointer_mode_host = */ true,
-      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), cuda_in_type, lda, CUDAMemory(b), cuda_in_type, ldb, &beta,
+      wrap::cublasGemmEx, stream, /* pointer_mode_host = */ !alpha.is_pointer(),
+      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
+      alpha.is_pointer() ? CUDAMemory(alpha.pointer()) : &alpha.value(),
+      CUDAMemory(a), cuda_in_type, lda, CUDAMemory(b), cuda_in_type, ldb,
+      beta.is_pointer() ? CUDAMemory(beta.pointer()) : &beta.value(),
       CUDAMemoryMutable(c), CUDADataType<OutT>::type, ldc,
       CUDAComputationType(computation_type),
       static_cast<cublasGemmAlgo_t>(algorithm));
@@ -2239,10 +2251,11 @@ bool CUDABlas::GetBlasGemmAlgorithms(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a, int lda,
-    const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int> *c,
-    int ldc, blas::ComputationType computation_type,
-    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+    uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,
+    const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b, int ldb,
+    const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
   return DoBlasGemmWithAlgorithmImpl(
       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
       computation_type, algorithm, output_profile_result);
@@ -2250,17 +2263,25 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, const Eigen::half &alpha,
+    uint64 n, uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
     const DeviceMemory<Eigen::half> &a, int lda,
-    const DeviceMemory<Eigen::half> &b, int ldb, const Eigen::half &beta,
-    DeviceMemory<Eigen::half> *c, int ldc,
-    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
-    blas::ProfileResult *output_profile_result) {
+    const DeviceMemory<Eigen::half> &b, int ldb,
+    const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   if (computation_type == blas::ComputationType::kF32) {
+    if (alpha.is_pointer() || beta.is_pointer()) {
+      // We cannot easily convert a pointer to f16 memory to a pointer to f32
+      // memory from here, so we don't support this for now.
+      // TODO(akuegel): Investigate whether we can do the conversion before
+      // calling DoBlasGemmWithAlgorithm.
+      return false;
+    }
+    HostOrDeviceScalar<float> float_alpha(static_cast<float>(alpha.value()));
+    HostOrDeviceScalar<float> float_beta(static_cast<float>(beta.value()));
     return DoBlasGemmWithAlgorithmImpl(
-        stream, transa, transb, m, n, k, static_cast<float>(alpha), a, lda, b,
-        ldb, static_cast<float>(beta), c, ldc, computation_type, algorithm,
-        output_profile_result);
+        stream, transa, transb, m, n, k, float_alpha, a, lda, b, ldb,
+        float_beta, c, ldc, computation_type, algorithm, output_profile_result);
   }
 
   CHECK_EQ(computation_type, blas::ComputationType::kF16);
@@ -2271,8 +2292,9 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
-    const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+    uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
+    int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
     int ldc, blas::ComputationType computation_type,
     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   return DoBlasGemmWithAlgorithmImpl(
@@ -2282,9 +2304,10 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
-    const DeviceMemory<double> &b, int ldb, double beta,
-    DeviceMemory<double> *c, int ldc, blas::ComputationType computation_type,
+    uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
+    int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
+    int ldc, blas::ComputationType computation_type,
     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   return DoBlasGemmWithAlgorithmImpl(
       stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
@@ -2293,10 +2316,11 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, std::complex<float> alpha,
+    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
     const DeviceMemory<std::complex<float>> &a, int lda,
     const DeviceMemory<std::complex<float>> &b, int ldb,
-    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    const HostOrDeviceScalar<std::complex<float>> &beta,
+    DeviceMemory<std::complex<float>> *c, int ldc,
     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
     blas::ProfileResult *output_profile_result) {
   return DoBlasGemmWithAlgorithmImpl(
@@ -2306,10 +2330,11 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
 
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, std::complex<double> alpha,
+    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
     const DeviceMemory<std::complex<double>> &a, int lda,
     const DeviceMemory<std::complex<double>> &b, int ldb,
-    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    const HostOrDeviceScalar<std::complex<double>> &beta,
+    DeviceMemory<std::complex<double>> *c, int ldc,
     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
     blas::ProfileResult *output_profile_result) {
   return DoBlasGemmWithAlgorithmImpl(
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 55c414a1f9..12dc5e47fd 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -21,6 +21,7 @@ limitations under the License.
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
 
 #include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
@@ -116,18 +117,13 @@ class CUDABlas : public blas::BlasSupport {
       int batch_count, ScratchAllocator *scratch_allocator);
 
   // Helper function for implementing DoBlasGemmWithAlgorithm.
-  //
-  // We take alpha and beta by const reference because T might be Eigen::half,
-  // and we want to avoid pulling in a dependency on Eigen.  When we pass the
-  // references to cublas, we essentially reinterpret_cast to __half, which is
-  // safe because Eigen::half inherits from __half.
   template <typename InT, typename OutT, typename CompT>
   bool DoBlasGemmWithAlgorithmImpl(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a,
-      int lda, const DeviceMemory<InT> &b, int ldb, const CompT &beta,
-      DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
-      blas::AlgorithmType algorithm,
+      uint64 n, uint64 k, const HostOrDeviceScalar<CompT> &alpha,
+      const DeviceMemory<InT> &a, int lda, const DeviceMemory<InT> &b, int ldb,
+      const HostOrDeviceScalar<CompT> &beta, DeviceMemory<OutT> *c, int ldc,
+      blas::ComputationType computation_type, blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);
 
   // Helper function for implementing DoBlasGemmWithProfiling.
diff --git a/tensorflow/stream_executor/host_or_device_scalar.h b/tensorflow/stream_executor/host_or_device_scalar.h
new file mode 100644
index 0000000000..c9e3e14778
--- /dev/null
+++ b/tensorflow/stream_executor/host_or_device_scalar.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_
+#define TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/stream_executor/device_memory.h"
+
+namespace stream_executor {
+
+// Represents a scalar that is either a host value or a single element stored
+// in device memory (wrapped in a DeviceMemory<ElemT>).
+template <typename ElemT>
+class HostOrDeviceScalar {
+ public:
+  // Host-value constructor. Deliberately implicit so that callers can pass a
+  // plain ElemT (usually a compile-time constant) where this type is expected.
+  HostOrDeviceScalar(ElemT value) : value_(value), is_pointer_(false) {}
+  explicit HostOrDeviceScalar(const DeviceMemory<ElemT>& pointer)
+      : pointer_(pointer), is_pointer_(true) {
+    CHECK_EQ(1, pointer.ElementCount());  // Must refer to exactly one element.
+  }
+
+  bool is_pointer() const { return is_pointer_; }
+  const DeviceMemory<ElemT>& pointer() const {
+    CHECK(is_pointer());  // Only valid when holding a device pointer.
+    return pointer_;
+  }
+  const ElemT& value() const {
+    CHECK(!is_pointer());  // Only valid when holding a host value.
+    return value_;
+  }
+
+ private:
+  union {  // Only the member selected by is_pointer_ is initialized.
+    ElemT value_;
+    DeviceMemory<ElemT> pointer_;
+  };
+  bool is_pointer_;  // True iff constructed from a DeviceMemory.
+};
+
+}  // namespace stream_executor
+#endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index f59d9a13ac..093f0c9306 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/host_buffer.h"
+#include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/platform.h"
@@ -133,6 +134,14 @@ string ToVlogString(float f) { return port::StrCat(f); }
 
 string ToVlogString(double d) { return port::StrCat(d); }
 
+// Dispatches to the ToVlogString overload for whichever alternative is held.
+template <typename T>
+string ToVlogString(const HostOrDeviceScalar<T> &memory_or_constant) {
+  return memory_or_constant.is_pointer()
+             ? ToVlogString(memory_or_constant.pointer())
+             : ToVlogString(memory_or_constant.value());
+}
+
 template <class T>
 string ToVlogString(port::ArraySlice<T> elements) {
   string str = port::StrCat(
@@ -3882,22 +3891,23 @@ Stream &Stream::ThenBlasGemmWithProfiling(
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, const Eigen::half &alpha, const DeviceMemory<Eigen::half> &a,
-    int lda, const DeviceMemory<Eigen::half> &b, int ldb,
-    const Eigen::half &beta, DeviceMemory<Eigen::half> *c, int ldc,
-    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
-    blas::ProfileResult *output_profile_result) {
+    uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
+    const DeviceMemory<Eigen::half> &a, int lda,
+    const DeviceMemory<Eigen::half> &b, int ldb,
+    const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
             PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
             PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
             PARAM(algorithm));
 
-  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
-                          uint64, const Eigen::half &,
-                          const DeviceMemory<Eigen::half> &, int,
-                          const DeviceMemory<Eigen::half> &, int,
-                          const Eigen::half &, DeviceMemory<Eigen::half> *, int,
-                          blas::ComputationType, blas::AlgorithmType>
+  ThenBlasWithProfileImpl<
+      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
+      const HostOrDeviceScalar<Eigen::half> &,
+      const DeviceMemory<Eigen::half> &, int, const DeviceMemory<Eigen::half> &,
+      int, const HostOrDeviceScalar<Eigen::half> &, DeviceMemory<Eigen::half> *,
+      int, blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
               m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type,
@@ -3906,18 +3916,20 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, int alpha, const DeviceMemory<int8> &a, int lda,
-    const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int> *c,
-    int ldc, blas::ComputationType computation_type,
-    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+    uint64 k, const HostOrDeviceScalar<int> &alpha, const DeviceMemory<int8> &a,
+    int lda, const DeviceMemory<int8> &b, int ldb,
+    const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
             PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
             PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
             PARAM(algorithm));
 
   ThenBlasWithProfileImpl<
-      blas::Transpose, blas::Transpose, uint64, uint64, uint64, int,
-      const DeviceMemory<int8> &, int, const DeviceMemory<int8> &, int, int,
+      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
+      const HostOrDeviceScalar<int> &, const DeviceMemory<int8> &, int,
+      const DeviceMemory<int8> &, int, const HostOrDeviceScalar<int> &,
       DeviceMemory<int> *, int, blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
@@ -3927,8 +3939,9 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
-    const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+    uint64 k, const HostOrDeviceScalar<float> &alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
+    int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
     int ldc, blas::ComputationType computation_type,
     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
@@ -3937,8 +3950,9 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
             PARAM(algorithm));
 
   ThenBlasWithProfileImpl<
-      blas::Transpose, blas::Transpose, uint64, uint64, uint64, float,
-      const DeviceMemory<float> &, int, const DeviceMemory<float> &, int, float,
+      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
+      const HostOrDeviceScalar<float> &, const DeviceMemory<float> &, int,
+      const DeviceMemory<float> &, int, const HostOrDeviceScalar<float> &,
       DeviceMemory<float> *, int, blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
@@ -3948,32 +3962,35 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
-    const DeviceMemory<double> &b, int ldb, double beta,
-    DeviceMemory<double> *c, int ldc, blas::ComputationType computation_type,
+    uint64 k, const HostOrDeviceScalar<double> &alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
+    int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
+    int ldc, blas::ComputationType computation_type,
     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
             PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
             PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
             PARAM(algorithm));
 
-  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
-                          uint64, double, const DeviceMemory<double> &, int,
-                          const DeviceMemory<double> &, int, double,
-                          DeviceMemory<double> *, int, blas::ComputationType,
-                          blas::AlgorithmType>
+  ThenBlasWithProfileImpl<
+      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
+      const HostOrDeviceScalar<double> &, const DeviceMemory<double> &, int,
+      const DeviceMemory<double> &, int, const HostOrDeviceScalar<double> &,
+      DeviceMemory<double> *, int, blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
-              m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type,
+              m, n, k, HostOrDeviceScalar<double>(alpha), a, lda, b, ldb,
+              HostOrDeviceScalar<double>(beta), c, ldc, computation_type,
               algorithm, output_profile_result);
 }
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, std::complex<float> alpha,
+    uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
     const DeviceMemory<std::complex<float>> &a, int lda,
     const DeviceMemory<std::complex<float>> &b, int ldb,
-    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    const HostOrDeviceScalar<std::complex<float>> &beta,
+    DeviceMemory<std::complex<float>> *c, int ldc,
     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
     blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
@@ -3981,12 +3998,14 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
             PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
             PARAM(algorithm));
 
-  ThenBlasWithProfileImpl<
-      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
-      std::complex<float>, const DeviceMemory<std::complex<float>> &, int,
-      const DeviceMemory<std::complex<float>> &, int, std::complex<float>,
-      DeviceMemory<std::complex<float>> *, int, blas::ComputationType,
-      blas::AlgorithmType>
+  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
+                          uint64,
+                          const HostOrDeviceScalar<std::complex<float>> &,
+                          const DeviceMemory<std::complex<float>> &, int,
+                          const DeviceMemory<std::complex<float>> &, int,
+                          const HostOrDeviceScalar<std::complex<float>> &,
+                          DeviceMemory<std::complex<float>> *, int,
+                          blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
               m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type,
@@ -3995,10 +4014,11 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
 
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, std::complex<double> alpha,
+    uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
     const DeviceMemory<std::complex<double>> &a, int lda,
     const DeviceMemory<std::complex<double>> &b, int ldb,
-    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    const HostOrDeviceScalar<std::complex<double>> &beta,
+    DeviceMemory<std::complex<double>> *c, int ldc,
     blas::ComputationType computation_type, blas::AlgorithmType algorithm,
     blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
@@ -4006,12 +4026,14 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
             PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
             PARAM(algorithm));
 
-  ThenBlasWithProfileImpl<
-      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
-      std::complex<double>, const DeviceMemory<std::complex<double>> &, int,
-      const DeviceMemory<std::complex<double>> &, int, std::complex<double>,
-      DeviceMemory<std::complex<double>> *, int, blas::ComputationType,
-      blas::AlgorithmType>
+  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
+                          uint64,
+                          const HostOrDeviceScalar<std::complex<double>> &,
+                          const DeviceMemory<std::complex<double>> &, int,
+                          const DeviceMemory<std::complex<double>> &, int,
+                          const HostOrDeviceScalar<std::complex<double>> &,
+                          DeviceMemory<std::complex<double>> *, int,
+                          blas::ComputationType, blas::AlgorithmType>
       impl;
   return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
               m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type,
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index d4a81440e9..3d1b011c57 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/dnn.h"
 #include "tensorflow/stream_executor/event.h"
 #include "tensorflow/stream_executor/fft.h"
+#include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/kernel.h"
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
@@ -1422,50 +1423,53 @@ class Stream {
   // See BlasSupport::DoBlasGemmWithAlgorithm.
   Stream &ThenBlasGemmWithAlgorithm(
       blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-      uint64 k, const Eigen::half &alpha, const DeviceMemory<Eigen::half> &a,
-      int lda, const DeviceMemory<Eigen::half> &b, int ldb,
-      const Eigen::half &beta, DeviceMemory<Eigen::half> *c, int ldc,
-      blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+      uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
+      const DeviceMemory<Eigen::half> &a, int lda,
+      const DeviceMemory<Eigen::half> &b, int ldb,
+      const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
+      int ldc, blas::ComputationType computation_type,
+      blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);
-  Stream &ThenBlasGemmWithAlgorithm(blas::Transpose transa,
-                                    blas::Transpose transb, uint64 m, uint64 n,
-                                    uint64 k, int alpha,
-                                    const DeviceMemory<int8> &a, int lda,
-                                    const DeviceMemory<int8> &b, int ldb,
-                                    int beta, DeviceMemory<int> *c, int ldc,
-                                    blas::ComputationType computation_type,
-                                    blas::AlgorithmType algorithm,
-                                    blas::ProfileResult *output_profile_result);
-  Stream &ThenBlasGemmWithAlgorithm(blas::Transpose transa,
-                                    blas::Transpose transb, uint64 m, uint64 n,
-                                    uint64 k, float alpha,
-                                    const DeviceMemory<float> &a, int lda,
-                                    const DeviceMemory<float> &b, int ldb,
-                                    float beta, DeviceMemory<float> *c, int ldc,
-                                    blas::ComputationType computation_type,
-                                    blas::AlgorithmType algorithm,
-                                    blas::ProfileResult *output_profile_result);
   Stream &ThenBlasGemmWithAlgorithm(
       blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-      uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
-      const DeviceMemory<double> &b, int ldb, double beta,
-      DeviceMemory<double> *c, int ldc, blas::ComputationType computation_type,
+      uint64 k, const HostOrDeviceScalar<int> &alpha,
+      const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b,
+      int ldb, const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c,
+      int ldc, blas::ComputationType computation_type,
       blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);
   Stream &ThenBlasGemmWithAlgorithm(
       blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-      uint64 k, std::complex<float> alpha,
+      uint64 k, const HostOrDeviceScalar<float> &alpha,
+      const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
+      int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
+      int ldc, blas::ComputationType computation_type,
+      blas::AlgorithmType algorithm,
+      blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemmWithAlgorithm(
+      blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+      uint64 k, const HostOrDeviceScalar<double> &alpha,
+      const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
+      int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
+      int ldc, blas::ComputationType computation_type,
+      blas::AlgorithmType algorithm,
+      blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemmWithAlgorithm(
+      blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+      uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
       const DeviceMemory<std::complex<float>> &a, int lda,
       const DeviceMemory<std::complex<float>> &b, int ldb,
-      std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+      const HostOrDeviceScalar<std::complex<float>> &beta,
+      DeviceMemory<std::complex<float>> *c, int ldc,
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);
   Stream &ThenBlasGemmWithAlgorithm(
       blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-      uint64 k, std::complex<double> alpha,
+      uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
       const DeviceMemory<std::complex<double>> &a, int lda,
       const DeviceMemory<std::complex<double>> &b, int ldb,
-      std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+      const HostOrDeviceScalar<std::complex<double>> &beta,
+      DeviceMemory<std::complex<double>> *c, int ldc,
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);
 
-- 
GitLab


From f62c472c470aee64147df58de584f0b8450b29ad Mon Sep 17 00:00:00 2001
From: Ian Langmore <langmore@google.com>
Date: Tue, 24 Apr 2018 06:08:14 -0700
Subject: [PATCH 352/450] Move LinearOperatorCirculant to third_party.

PiperOrigin-RevId: 194075622
---
 tensorflow/contrib/linalg/__init__.py         |    4 +
 tensorflow/python/kernel_tests/linalg/BUILD   |   20 +
 .../linalg/linear_operator_circulant_test.py  |  700 +++++++++++
 tensorflow/python/ops/linalg/linalg.py        |    1 +
 .../ops/linalg/linear_operator_circulant.py   | 1074 +++++++++++++++++
 ...ear-operator-circulant.__metaclass__.pbtxt |   14 +
 ...ow.linalg.-linear-operator-circulant.pbtxt |  155 +++
 ...-operator-circulant2-d.__metaclass__.pbtxt |   14 +
 ...linalg.-linear-operator-circulant2-d.pbtxt |  155 +++
 ...-operator-circulant3-d.__metaclass__.pbtxt |   14 +
 ...linalg.-linear-operator-circulant3-d.pbtxt |  155 +++
 .../tools/api/golden/tensorflow.linalg.pbtxt  |   12 +
 12 files changed, 2318 insertions(+)
 create mode 100644 tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
 create mode 100644 tensorflow/python/ops/linalg/linear_operator_circulant.py
 create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt

diff --git a/tensorflow/contrib/linalg/__init__.py b/tensorflow/contrib/linalg/__init__.py
index 38bd66b13f..554854da84 100644
--- a/tensorflow/contrib/linalg/__init__.py
+++ b/tensorflow/contrib/linalg/__init__.py
@@ -18,6 +18,9 @@ See the @{$python/contrib.linalg} guide.
 
 @@LinearOperator
 @@LinearOperatorBlockDiag
+@@LinearOperatorCirculant
+@@LinearOperatorCirculant2D
+@@LinearOperatorCirculant3D
 @@LinearOperatorDiag
 @@LinearOperatorIdentity
 @@LinearOperatorScaledIdentity
@@ -39,6 +42,7 @@ from tensorflow.contrib.linalg.python.ops.linear_operator_addition import *
 from tensorflow.contrib.linalg.python.ops.linear_operator_block_diag import *
 from tensorflow.contrib.linalg.python.ops.linear_operator_kronecker import *
 from tensorflow.python.ops.linalg.linear_operator import *
+from tensorflow.python.ops.linalg.linear_operator_circulant import *
 from tensorflow.python.ops.linalg.linear_operator_composition import *
 from tensorflow.python.ops.linalg.linear_operator_diag import *
 from tensorflow.python.ops.linalg.linear_operator_full_matrix import *
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index 7ffa48b653..faeccc8fba 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -43,6 +43,26 @@ cuda_py_test(
     tags = ["noasan"],  # times out b/63678675
 )
 
+cuda_py_test(
+    name = "linear_operator_circulant_test",
+    size = "medium",
+    srcs = ["linear_operator_circulant_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    shard_count = 5,
+    tags = ["noasan"],  # times out b/63678675
+)
+
 cuda_py_test(
     name = "linear_operator_diag_test",
     size = "medium",
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
new file mode 100644
index 0000000000..e7f2f1c12b
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
@@ -0,0 +1,700 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.linalg import linalg
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+rng = np.random.RandomState(0)
+_to_complex = linear_operator_circulant._to_complex
+
+
+class LinearOperatorCirculantBaseTest(object):
+  """Common class for circulant tests."""
+
+  @contextlib.contextmanager
+  def test_session(self, *args, **kwargs):
+    with test.TestCase.test_session(self, *args, **kwargs) as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        yield sess
+
+  def _shape_to_spectrum_shape(self, shape):
+    # If spectrum.shape = batch_shape + [N],
+    # this creates an operator of shape batch_shape + [N, N]
+    return shape[:-1]
+
+  def _spectrum_to_circulant_1d(self, spectrum, shape, dtype):
+    """Creates a circulant matrix from a spectrum.
+
+    Intentionally done in an explicit yet inefficient way.  This provides a
+    cross check to the main code that uses fancy reshapes.
+
+    Args:
+      spectrum: Float or complex `Tensor`.
+      shape:  Python list.  Desired shape of returned matrix.
+      dtype:  Type to cast the returned matrix to.
+
+    Returns:
+      Circulant (batch) matrix of desired `dtype`.
+    """
+    spectrum = _to_complex(spectrum)
+    spectrum_shape = self._shape_to_spectrum_shape(shape)
+    domain_dimension = spectrum_shape[-1]
+    if not domain_dimension:
+      return array_ops.zeros(shape, dtype)
+
+    # Explicitly compute the action of spectrum on basis vectors.
+    matrix_rows = []
+    for m in range(domain_dimension):
+      x = np.zeros([domain_dimension])
+      # x is a basis vector.
+      x[m] = 1.0
+      fft_x = math_ops.fft(x)
+      h_convolve_x = math_ops.ifft(spectrum * fft_x)
+      matrix_rows.append(h_convolve_x)
+    matrix = array_ops.stack(matrix_rows, axis=-1)
+    return math_ops.cast(matrix, dtype)
+
+
+class LinearOperatorCirculantTestSelfAdjointOperator(
+    LinearOperatorCirculantBaseTest,
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Test of LinearOperatorCirculant when operator is self-adjoint.
+
+  Real spectrum <==> Self adjoint operator.
+  Note that when the spectrum is real, the operator may still be complex.
+  """
+
+  @property
+  def _dtypes_to_test(self):
+    # This operator will always be complex because, although the spectrum is
+    # real, the matrix will not be real.
+    return [dtypes.complex64]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = build_info.shape
+    # For this test class, we are creating real spectrums.
+    # We also want the spectrum to have eigenvalues bounded away from zero.
+    #
+    # spectrum is bounded away from zero.
+    spectrum = linear_operator_test_util.random_sign_uniform(
+        shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+    # If dtype is complex, cast spectrum to complex.  The imaginary part will be
+    # zero, so the operator will still be self-adjoint.
+    spectrum = math_ops.cast(spectrum, dtype)
+
+    if use_placeholder:
+      spectrum_ph = array_ops.placeholder(dtypes.complex64)
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # it is random and we want the same value used for both mat and feed_dict.
+      spectrum = spectrum.eval()
+      operator = linalg.LinearOperatorCirculant(
+          spectrum_ph, is_self_adjoint=True, input_output_dtype=dtype)
+      feed_dict = {spectrum_ph: spectrum}
+    else:
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, is_self_adjoint=True, input_output_dtype=dtype)
+      feed_dict = None
+
+    mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
+
+    return operator, mat, feed_dict
+
+  def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
+    with self.test_session():
+      spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtypes.complex64)
+      matrix = operator.to_dense()
+      imag_matrix = math_ops.imag(matrix)
+      eps = np.finfo(np.float32).eps
+      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+
+
+class LinearOperatorCirculantTestHermitianSpectrum(
+    LinearOperatorCirculantBaseTest,
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Test of LinearOperatorCirculant when the spectrum is Hermitian.
+
+  Hermitian spectrum <==> Real valued operator.  We test both real and complex
+  dtypes here though.  So in some cases the matrix will be complex but with
+  zero imaginary part.
+  """
+
+  @property
+  def _dtypes_to_test(self):
+    return [dtypes.float32, dtypes.complex64]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = build_info.shape
+    # For this test class, we are creating Hermitian spectrums.
+    # We also want the spectrum to have eigenvalues bounded away from zero.
+    #
+    # pre_spectrum is bounded away from zero.
+    pre_spectrum = linear_operator_test_util.random_uniform(
+        shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+    pre_spectrum_c = _to_complex(pre_spectrum)
+
+    # Real{IFFT[pre_spectrum]}
+    #  = IFFT[EvenPartOf[pre_spectrum]]
+    # is the IFFT of something that is also bounded away from zero.
+    # Therefore, FFT[pre_h] would be a well-conditioned spectrum.
+    pre_h = math_ops.ifft(pre_spectrum_c)
+
+    # A spectrum is Hermitian iff it is the DFT of a real convolution kernel.
+    # So we will make spectrum = FFT[h], for real valued h.
+    h = math_ops.real(pre_h)
+    h_c = _to_complex(h)
+
+    spectrum = math_ops.fft(h_c)
+
+    if use_placeholder:
+      spectrum_ph = array_ops.placeholder(dtypes.complex64)
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # it is random and we want the same value used for both mat and feed_dict.
+      spectrum = spectrum.eval()
+      operator = linalg.LinearOperatorCirculant(
+          spectrum_ph, input_output_dtype=dtype)
+      feed_dict = {spectrum_ph: spectrum}
+    else:
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtype)
+      feed_dict = None
+
+    mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
+
+    return operator, mat, feed_dict
+
+  def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
+    with self.test_session():
+      spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtypes.complex64)
+      matrix = operator.to_dense()
+      imag_matrix = math_ops.imag(matrix)
+      eps = np.finfo(np.float32).eps
+      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+
+
+class LinearOperatorCirculantTestNonHermitianSpectrum(
+    LinearOperatorCirculantBaseTest,
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Test of LinearOperatorCirculant when the spectrum is not Hermitian.
+
+  Non-Hermitian spectrum <==> Complex valued operator.
+  We test only complex dtypes here.
+  """
+
+  @property
+  def _dtypes_to_test(self):
+    return [dtypes.complex64]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = build_info.shape
+    # Will be well conditioned enough to get accurate solves.
+    spectrum = linear_operator_test_util.random_sign_uniform(
+        shape=self._shape_to_spectrum_shape(shape),
+        dtype=dtypes.complex64,
+        minval=1.,
+        maxval=2.)
+
+    if use_placeholder:
+      spectrum_ph = array_ops.placeholder(dtypes.complex64)
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # it is random and we want the same value used for both mat and feed_dict.
+      spectrum = spectrum.eval()
+      operator = linalg.LinearOperatorCirculant(
+          spectrum_ph, input_output_dtype=dtype)
+      feed_dict = {spectrum_ph: spectrum}
+    else:
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtype)
+      feed_dict = None
+
+    mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
+
+    return operator, mat, feed_dict
+
+  def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
+    with self.test_session():
+      spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtypes.complex64)
+      matrix = operator.to_dense()
+      imag_matrix = math_ops.imag(matrix)
+      eps = np.finfo(np.float32).eps
+      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+
+  def test_simple_positive_real_spectrum_gives_self_adjoint_pos_def_oper(self):
+    with self.test_session() as sess:
+      spectrum = math_ops.cast([6., 4, 2], dtypes.complex64)
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtypes.complex64)
+      matrix, matrix_h = sess.run(
+          [operator.to_dense(),
+           linalg.adjoint(operator.to_dense())])
+      self.assertAllClose(matrix, matrix_h)
+      operator.assert_positive_definite().run()  # Should not fail
+      operator.assert_self_adjoint().run()  # Should not fail
+
+  def test_defining_operator_using_real_convolution_kernel(self):
+    with self.test_session():
+      convolution_kernel = [1., 2., 1.]
+      spectrum = math_ops.fft(
+          math_ops.cast(convolution_kernel, dtypes.complex64))
+
+      # spectrum is shape [3] ==> operator is shape [3, 3]
+      # spectrum is Hermitian ==> operator is real.
+      operator = linalg.LinearOperatorCirculant(spectrum)
+
+      # Allow for complex output so we can make sure it has zero imag part.
+      self.assertEqual(operator.dtype, dtypes.complex64)
+
+      matrix = operator.to_dense().eval()
+      np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6)
+
+  def test_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
+    with self.test_session():
+      # Make spectrum the FFT of a real convolution kernel h.  This ensures that
+      # spectrum is Hermitian.
+      h = linear_operator_test_util.random_normal(shape=(3, 4))
+      spectrum = math_ops.fft(math_ops.cast(h, dtypes.complex64))
+      operator = linalg.LinearOperatorCirculant(
+          spectrum, input_output_dtype=dtypes.complex64)
+      matrix = operator.to_dense()
+      imag_matrix = math_ops.imag(matrix)
+      eps = np.finfo(np.float32).eps
+      np.testing.assert_allclose(
+          0, imag_matrix.eval(), rtol=0, atol=eps * 3 * 4)
+
+  def test_convolution_kernel_same_as_first_row_of_to_dense(self):
+    spectrum = [[3., 2., 1.], [2., 1.5, 1.]]
+    with self.test_session():
+      operator = linalg.LinearOperatorCirculant(spectrum)
+      h = operator.convolution_kernel()
+      c = operator.to_dense()
+
+      self.assertAllEqual((2, 3), h.get_shape())
+      self.assertAllEqual((2, 3, 3), c.get_shape())
+      self.assertAllClose(h.eval(), c.eval()[:, :, 0])
+
+  def test_assert_non_singular_fails_for_singular_operator(self):
+    spectrum = math_ops.cast([0, 4, 2j + 2], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant(spectrum)
+    with self.test_session():
+      with self.assertRaisesOpError("Singular operator"):
+        operator.assert_non_singular().run()
+
+  def test_assert_non_singular_does_not_fail_for_non_singular_operator(self):
+    spectrum = math_ops.cast([-3j, 4, 2j + 2], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant(spectrum)
+    with self.test_session():
+      operator.assert_non_singular().run()  # Should not fail
+
+  def test_assert_positive_definite_fails_for_non_positive_definite(self):
+    spectrum = math_ops.cast([6., 4, 2j], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant(spectrum)
+    with self.test_session():
+      with self.assertRaisesOpError("Not positive definite"):
+        operator.assert_positive_definite().run()
+
+  def test_assert_positive_definite_does_not_fail_when_pos_def(self):
+    spectrum = math_ops.cast([6., 4, 2j + 2], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant(spectrum)
+    with self.test_session():
+      operator.assert_positive_definite().run()  # Should not fail
+
+  def test_real_spectrum_and_not_self_adjoint_hint_raises(self):
+    spectrum = [1., 2.]
+    with self.assertRaisesRegexp(ValueError, "real.*always.*self-adjoint"):
+      linalg.LinearOperatorCirculant(spectrum, is_self_adjoint=False)
+
+  def test_real_spectrum_auto_sets_is_self_adjoint_to_true(self):
+    spectrum = [1., 2.]
+    operator = linalg.LinearOperatorCirculant(spectrum)
+    self.assertTrue(operator.is_self_adjoint)
+
+
+class LinearOperatorCirculant2DBaseTest(object):
+  """Common class for 2D circulant tests."""
+
+  @contextlib.contextmanager
+  def test_session(self, *args, **kwargs):
+    with test.TestCase.test_session(self, *args, **kwargs) as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        yield sess
+
+  @property
+  def _operator_build_infos(self):
+    build_info = linear_operator_test_util.OperatorBuildInfo
+    # non-batch operators (n, n) and batch operators.
+    return [
+        build_info((0, 0)),
+        build_info((1, 1)),
+        build_info((1, 6, 6)),
+        build_info((3, 4, 4)),
+        build_info((2, 1, 3, 3))
+    ]
+
+  def _shape_to_spectrum_shape(self, shape):
+    """Get a spectrum shape that will make an operator of desired shape."""
+    # This 2D block circulant operator takes a spectrum of shape
+    # batch_shape + [N0, N1],
+    # and creates an operator of shape
+    # batch_shape + [N0*N1, N0*N1]
+    if shape == (0, 0):
+      return (0, 0)
+    elif shape == (1, 1):
+      return (1, 1)
+    elif shape == (1, 6, 6):
+      return (1, 2, 3)
+    elif shape == (3, 4, 4):
+      return (3, 2, 2)
+    elif shape == (2, 1, 3, 3):
+      return (2, 1, 3, 1)
+    else:
+      raise ValueError("Unhandled shape: %s" % shape)
+
+  def _spectrum_to_circulant_2d(self, spectrum, shape, dtype):
+    """Creates a block circulant matrix from a spectrum.
+
+    Intentionally done in an explicit yet inefficient way.  This provides a
+    cross check to the main code that uses fancy reshapes.
+
+    Args:
+      spectrum: Float or complex `Tensor`.
+      shape:  Python list.  Desired shape of returned matrix.
+      dtype:  Type to cast the returned matrix to.
+
+    Returns:
+      Block circulant (batch) matrix of desired `dtype`.
+    """
+    spectrum = _to_complex(spectrum)
+    spectrum_shape = self._shape_to_spectrum_shape(shape)
+    domain_dimension = spectrum_shape[-1]
+    if not domain_dimension:
+      return array_ops.zeros(shape, dtype)
+
+    block_shape = spectrum_shape[-2:]
+
+    # Explicitly compute the action of spectrum on basis vectors.
+    matrix_rows = []
+    for n0 in range(block_shape[0]):
+      for n1 in range(block_shape[1]):
+        x = np.zeros(block_shape)
+        # x is a basis vector.
+        x[n0, n1] = 1.0
+        fft_x = math_ops.fft2d(x)
+        h_convolve_x = math_ops.ifft2d(spectrum * fft_x)
+        # We want the flat version of the action of the operator on a basis
+        # vector, not the block version.
+        h_convolve_x = array_ops.reshape(h_convolve_x, shape[:-1])
+        matrix_rows.append(h_convolve_x)
+    matrix = array_ops.stack(matrix_rows, axis=-1)
+    return math_ops.cast(matrix, dtype)
+
+
+class LinearOperatorCirculant2DTestHermitianSpectrum(
+    LinearOperatorCirculant2DBaseTest,
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Test of LinearOperatorCirculant2D when the spectrum is Hermitian.
+
+  Hermitian spectrum <==> Real valued operator.  We test both real and complex
+  dtypes here though.  So in some cases the matrix will be complex but with
+  zero imaginary part.
+  """
+
+  @property
+  def _dtypes_to_test(self):
+    return [dtypes.float32, dtypes.complex64]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = build_info.shape
+    # For this test class, we are creating Hermitian spectrums.
+    # We also want the spectrum to have eigenvalues bounded away from zero.
+    #
+    # pre_spectrum is bounded away from zero.
+    pre_spectrum = linear_operator_test_util.random_uniform(
+        shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+    pre_spectrum_c = _to_complex(pre_spectrum)
+
+    # Real{IFFT[pre_spectrum]}
+    #  = IFFT[EvenPartOf[pre_spectrum]]
+    # is the IFFT of something that is also bounded away from zero.
+    # Therefore, FFT[pre_h] would be a well-conditioned spectrum.
+    pre_h = math_ops.ifft2d(pre_spectrum_c)
+
+    # A spectrum is Hermitian iff it is the DFT of a real convolution kernel.
+    # So we will make spectrum = FFT[h], for real valued h.
+    h = math_ops.real(pre_h)
+    h_c = _to_complex(h)
+
+    spectrum = math_ops.fft2d(h_c)
+
+    if use_placeholder:
+      spectrum_ph = array_ops.placeholder(dtypes.complex64)
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # it is random and we want the same value used for both mat and feed_dict.
+      spectrum = spectrum.eval()
+      operator = linalg.LinearOperatorCirculant2D(
+          spectrum_ph, input_output_dtype=dtype)
+      feed_dict = {spectrum_ph: spectrum}
+    else:
+      operator = linalg.LinearOperatorCirculant2D(
+          spectrum, input_output_dtype=dtype)
+      feed_dict = None
+
+    mat = self._spectrum_to_circulant_2d(spectrum, shape, dtype=dtype)
+
+    return operator, mat, feed_dict
+
+
+class LinearOperatorCirculant2DTestNonHermitianSpectrum(
+    LinearOperatorCirculant2DBaseTest,
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Test of LinearOperatorCirculant2D when the spectrum is not Hermitian.
+
+  Non-Hermitian spectrum <==> Complex valued operator.
+  We test only complex dtypes here.
+  """
+
+  @property
+  def _dtypes_to_test(self):
+    return [dtypes.complex64]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = build_info.shape
+    # Will be well conditioned enough to get accurate solves.
+    spectrum = linear_operator_test_util.random_sign_uniform(
+        shape=self._shape_to_spectrum_shape(shape),
+        dtype=dtype,
+        minval=1.,
+        maxval=2.)
+
+    if use_placeholder:
+      spectrum_ph = array_ops.placeholder(dtypes.complex64)
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # it is random and we want the same value used for both mat and feed_dict.
+      spectrum = spectrum.eval()
+      operator = linalg.LinearOperatorCirculant2D(
+          spectrum_ph, input_output_dtype=dtype)
+      feed_dict = {spectrum_ph: spectrum}
+    else:
+      operator = linalg.LinearOperatorCirculant2D(
+          spectrum, input_output_dtype=dtype)
+      feed_dict = None
+
+    mat = self._spectrum_to_circulant_2d(spectrum, shape, dtype=dtype)
+
+    return operator, mat, feed_dict
+
+  def test_real_hermitian_spectrum_gives_real_symmetric_operator(self):
+    with self.test_session() as sess:
+      # This is a real and hermitian spectrum.
+      spectrum = [[1., 2., 2.], [3., 4., 4.], [3., 4., 4.]]
+      operator = linalg.LinearOperatorCirculant(spectrum)
+
+      matrix_tensor = operator.to_dense()
+      self.assertEqual(matrix_tensor.dtype,
+                       linear_operator_circulant._DTYPE_COMPLEX)
+      matrix_t = array_ops.matrix_transpose(matrix_tensor)
+      imag_matrix = math_ops.imag(matrix_tensor)
+      matrix, matrix_transpose, imag_matrix = sess.run(
+          [matrix_tensor, matrix_t, imag_matrix])
+
+      np.testing.assert_allclose(0, imag_matrix, atol=1e-6)
+      self.assertAllClose(matrix, matrix_transpose, atol=0)
+
+  def test_real_spectrum_gives_self_adjoint_operator(self):
+    with self.test_session() as sess:
+      # This spectrum is real, but (being random) not necessarily Hermitian.
+      spectrum = linear_operator_test_util.random_normal(
+          shape=(3, 3), dtype=dtypes.float32)
+      operator = linalg.LinearOperatorCirculant2D(spectrum)
+
+      matrix_tensor = operator.to_dense()
+      self.assertEqual(matrix_tensor.dtype,
+                       linear_operator_circulant._DTYPE_COMPLEX)
+      matrix_h = linalg.adjoint(matrix_tensor)
+      matrix, matrix_h = sess.run([matrix_tensor, matrix_h])
+      self.assertAllClose(matrix, matrix_h, atol=0)
+
+  def test_assert_non_singular_fails_for_singular_operator(self):
+    spectrum = math_ops.cast([[0, 4], [2j + 2, 3.]], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant2D(spectrum)
+    with self.test_session():
+      with self.assertRaisesOpError("Singular operator"):
+        operator.assert_non_singular().run()
+
+  def test_assert_non_singular_does_not_fail_for_non_singular_operator(self):
+    spectrum = math_ops.cast([[-3j, 4], [2j + 2, 3.]], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant2D(spectrum)
+    with self.test_session():
+      operator.assert_non_singular().run()  # Should not fail
+
+  def test_assert_positive_definite_fails_for_non_positive_definite(self):
+    spectrum = math_ops.cast([[6., 4], [2j, 3.]], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant2D(spectrum)
+    with self.test_session():
+      with self.assertRaisesOpError("Not positive definite"):
+        operator.assert_positive_definite().run()
+
+  def test_assert_positive_definite_does_not_fail_when_pos_def(self):
+    spectrum = math_ops.cast([[6., 4], [2j + 2, 3.]], dtypes.complex64)
+    operator = linalg.LinearOperatorCirculant2D(spectrum)
+    with self.test_session():
+      operator.assert_positive_definite().run()  # Should not fail
+
+  def test_real_spectrum_and_not_self_adjoint_hint_raises(self):
+    spectrum = [[1., 2.], [3., 4]]
+    with self.assertRaisesRegexp(ValueError, "real.*always.*self-adjoint"):
+      linalg.LinearOperatorCirculant2D(spectrum, is_self_adjoint=False)
+
+  def test_real_spectrum_auto_sets_is_self_adjoint_to_true(self):
+    spectrum = [[1., 2.], [3., 4]]
+    operator = linalg.LinearOperatorCirculant2D(spectrum)
+    self.assertTrue(operator.is_self_adjoint)
+
+  def test_invalid_dtype_raises(self):
+    spectrum = array_ops.constant(rng.rand(2, 2, 2))
+    with self.assertRaisesRegexp(TypeError, "must have dtype"):
+      linalg.LinearOperatorCirculant2D(spectrum)
+
+  def test_invalid_rank_raises(self):
+    spectrum = array_ops.constant(np.float32(rng.rand(2)))
+    with self.assertRaisesRegexp(ValueError, "must have at least 2 dimensions"):
+      linalg.LinearOperatorCirculant2D(spectrum)
+
+
+class LinearOperatorCirculant3DTest(test.TestCase):
+  """Simple test of the 3D case.  See also the 1D and 2D tests."""
+
+  @contextlib.contextmanager
+  def test_session(self, *args, **kwargs):
+    with test.TestCase.test_session(self, *args, **kwargs) as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        yield sess
+
+  def test_real_spectrum_gives_self_adjoint_operator(self):
+    with self.test_session() as sess:
+      # This spectrum is real, but (being random) not necessarily Hermitian.
+      spectrum = linear_operator_test_util.random_normal(
+          shape=(2, 2, 3, 5), dtype=dtypes.float32)
+      operator = linalg.LinearOperatorCirculant3D(spectrum)
+      self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), operator.shape)
+
+      matrix_tensor = operator.to_dense()
+      self.assertEqual(matrix_tensor.dtype,
+                       linear_operator_circulant._DTYPE_COMPLEX)
+      matrix_h = linalg.adjoint(matrix_tensor)
+
+      matrix, matrix_h = sess.run([matrix_tensor, matrix_h])
+      self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), matrix.shape)
+      self.assertAllClose(matrix, matrix_h)
+
+  def test_defining_operator_using_real_convolution_kernel(self):
+    with self.test_session():
+      convolution_kernel = linear_operator_test_util.random_normal(
+          shape=(2, 2, 3, 5), dtype=dtypes.float32)
+      # Convolution kernel is real ==> spectrum is Hermitian.
+      spectrum = math_ops.fft3d(
+          math_ops.cast(convolution_kernel, dtypes.complex64))
+
+      # spectrum is Hermitian ==> operator is real.
+      operator = linalg.LinearOperatorCirculant3D(spectrum)
+      self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), operator.shape)
+
+      # Allow for complex output so we can make sure it has zero imag part.
+      self.assertEqual(operator.dtype, dtypes.complex64)
+      matrix = operator.to_dense().eval()
+      self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), matrix.shape)
+      np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6)
+
+  def test_defining_spd_operator_by_taking_real_part(self):
+    with self.test_session() as sess:
+      # S is real and positive.
+      s = linear_operator_test_util.random_uniform(
+          shape=(10, 2, 3, 4), dtype=dtypes.float32, minval=1., maxval=2.)
+
+      # Let S = S1 + S2, the Hermitian and anti-hermitian parts.
+      # S1 = 0.5 * (S + S^H), S2 = 0.5 * (S - S^H),
+      # where ^H is the Hermitian transpose of the function:
+      #    f(n0, n1, n2)^H := ComplexConjugate[f(N0-n0, N1-n1, N2-n2)].
+      # We want to isolate S1, since
+      #   S1 is Hermitian by construction
+      #   S1 is real since S is
+      #   S1 is positive since it is the sum of two positive kernels
+
+      # IDFT[S] = IDFT[S1] + IDFT[S2]
+      #         =      H1  +      H2
+      # where H1 is real since it is Hermitian,
+      # and H2 is imaginary since it is anti-Hermitian.
+      ifft_s = math_ops.ifft3d(math_ops.cast(s, dtypes.complex64))
+
+      # Throw away H2, keep H1.
+      real_ifft_s = math_ops.real(ifft_s)
+
+      # This is the perfect spectrum!
+      # spectrum = DFT[H1]
+      #          = S1,
+      fft_real_ifft_s = math_ops.fft3d(
+          math_ops.cast(real_ifft_s, dtypes.complex64))
+
+      # S1 is Hermitian ==> operator is real.
+      # S1 is real ==> operator is self-adjoint.
+      # S1 is positive ==> operator is positive-definite.
+      operator = linalg.LinearOperatorCirculant3D(fft_real_ifft_s)
+
+      # Allow for complex output so we can check operator has zero imag part.
+      self.assertEqual(operator.dtype, dtypes.complex64)
+      matrix, matrix_t = sess.run([
+          operator.to_dense(),
+          array_ops.matrix_transpose(operator.to_dense())
+      ])
+      operator.assert_positive_definite().run()  # Should not fail.
+      np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6)
+      self.assertAllClose(matrix, matrix_t)
+
+      # Just to test the theory, get S2 as well.
+      # This should create an imaginary operator.
+      # S2 is anti-Hermitian ==> operator is imaginary.
+      # S2 is real ==> operator is self-adjoint.
+      imag_ifft_s = math_ops.imag(ifft_s)
+      fft_imag_ifft_s = math_ops.fft3d(
+          1j * math_ops.cast(imag_ifft_s, dtypes.complex64))
+      operator_imag = linalg.LinearOperatorCirculant3D(fft_imag_ifft_s)
+
+      matrix, matrix_h = sess.run([
+          operator_imag.to_dense(),
+          array_ops.matrix_transpose(math_ops.conj(operator_imag.to_dense()))
+      ])
+      self.assertAllClose(matrix, matrix_h)
+      np.testing.assert_allclose(0, np.real(matrix), atol=1e-7)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index 14319025ff..d73c21cdc0 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.python.ops.linalg.linalg_impl import *
 from tensorflow.python.ops.linalg.linear_operator import *
+from tensorflow.python.ops.linalg.linear_operator_circulant import *
 from tensorflow.python.ops.linalg.linear_operator_composition import *
 from tensorflow.python.ops.linalg.linear_operator_diag import *
 from tensorflow.python.ops.linalg.linear_operator_full_matrix import *
diff --git a/tensorflow/python/ops/linalg/linear_operator_circulant.py b/tensorflow/python/ops/linalg/linear_operator_circulant.py
new file mode 100644
index 0000000000..c367ed25ad
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_circulant.py
@@ -0,0 +1,1074 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`LinearOperator` coming from a [[nested] block] circulant matrix."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = [
+    "LinearOperatorCirculant",
+    "LinearOperatorCirculant2D",
+    "LinearOperatorCirculant3D",
+]
+
+# Different FFT Ops will be used for different block depths.
+_FFT_OP = {1: math_ops.fft, 2: math_ops.fft2d, 3: math_ops.fft3d}
+_IFFT_OP = {1: math_ops.ifft, 2: math_ops.ifft2d, 3: math_ops.ifft3d}
+
+# This is the only dtype allowed with fft ops.
+# TODO(langmore) Add other types once available.
+_DTYPE_COMPLEX = dtypes.complex64
+
+
+# TODO(langmore) Add transformations that create common spectrums, e.g.
+#   starting with the convolution kernel
+#   start with half a spectrum, and create a Hermitian one.
+#   common filters.
+# TODO(langmore) Support rectangular Toeplitz matrices.
+class _BaseLinearOperatorCirculant(linear_operator.LinearOperator):
+  """Base class for circulant operators.  Not user facing.
+
+  `LinearOperator` acting like a [batch] [[nested] block] circulant matrix.
+  """
+
+  def __init__(self,
+               spectrum,
+               block_depth,
+               input_output_dtype=_DTYPE_COMPLEX,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=True,
+               name="LinearOperatorCirculant"):
+    r"""Initialize a `_BaseLinearOperatorCirculant`.
+
+    Args:
+      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes are
+        `float32`, `complex64`.  Type may differ from `input_output_dtype`.
+      block_depth:  Python integer, either 1, 2, or 3.  Will be 1 for circulant,
+        2 for block circulant, and 3 for nested block circulant.
+      input_output_dtype: `dtype` for input/output.  Must be either
+        `float32` or `complex64`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.  If `spectrum` is real, this will always be true.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix\
+            #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name:  A name to prepend to all ops created by this class.
+
+    Raises:
+      ValueError:  If `block_depth` is not an allowed value.
+      TypeError:  If `spectrum` is not an allowed type.
+    """
+
+    allowed_block_depths = [1, 2, 3]
+
+    self._name = name
+
+    if block_depth not in allowed_block_depths:
+      raise ValueError("Expected block_depth to be in %s.  Found: %s." %
+                       (allowed_block_depths, block_depth))
+    self._block_depth = block_depth
+
+    with ops.name_scope(name, values=[spectrum]):
+      self._spectrum = self._check_spectrum_and_return_tensor(spectrum)
+
+      # Check and auto-set hints.
+      if not self.spectrum.dtype.is_complex:
+        if is_self_adjoint is False:
+          raise ValueError(
+              "A real spectrum always corresponds to a self-adjoint operator.")
+        is_self_adjoint = True
+
+      if is_square is False:
+        raise ValueError(
+            "A [[nested] block] circulant operator is always square.")
+      is_square = True
+
+      # If spectrum.shape = [s0, s1, s2], and block_depth = 2,
+      # block_shape = [s1, s2]
+      s_shape = array_ops.shape(self.spectrum)
+      self._block_shape_tensor = s_shape[-self.block_depth:]
+
+      # Add common variants of spectrum to the graph.
+      self._spectrum_complex = _to_complex(self.spectrum)
+      self._abs_spectrum = math_ops.abs(self.spectrum)
+      self._conj_spectrum = math_ops.conj(self._spectrum_complex)
+
+      super(_BaseLinearOperatorCirculant, self).__init__(
+          dtype=dtypes.as_dtype(input_output_dtype),
+          graph_parents=[self.spectrum],
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  def _check_spectrum_and_return_tensor(self, spectrum):
+    """Static check of spectrum.  Then return `Tensor` version."""
+    spectrum = ops.convert_to_tensor(spectrum, name="spectrum")
+
+    allowed_dtypes = [dtypes.float32, dtypes.complex64]
+    if spectrum.dtype not in allowed_dtypes:
+      raise TypeError("Argument spectrum must have dtype in %s.  Found: %s" %
+                      (allowed_dtypes, spectrum.dtype))
+    if spectrum.get_shape().ndims is not None:
+      if spectrum.get_shape().ndims < self.block_depth:
+        raise ValueError(
+            "Argument spectrum must have at least %d dimensions.  Found: %s" %
+            (self.block_depth, spectrum))
+    return spectrum
+
+  @property
+  def block_depth(self):
+    """Depth of recursively defined circulant blocks defining this `Operator`.
+
+    With `A` the dense representation of this `Operator`,
+
+    `block_depth = 1` means `A` is symmetric circulant.  For example,
+
+    ```
+    A = |x y z y|
+        |y x y z|
+        |z y x y|
+        |y z y x|
+    ```
+
+    `block_depth = 2` means `A` is block symmetric circulant with symmetric
+    circulant blocks.  For example, with `X`, `Y`, `Z` symmetric circulant,
+
+    ```
+    A = |X Y Z Y|
+        |Y X Y Z|
+        |Z Y X Y|
+        |Y Z Y X|
+    ```
+
+    `block_depth = 3` means `A` is block symmetric circulant with block
+    symmetric circulant blocks.
+
+    Returns:
+      Python `integer`.
+    """
+    return self._block_depth
+
+  def block_shape_tensor(self):
+    """Shape of the block dimensions of `self.spectrum`."""
+    return self._block_shape_tensor
+
+  @property
+  def block_shape(self):
+    return self.spectrum.get_shape()[-self.block_depth:]
+
+  @property
+  def spectrum(self):
+    return self._spectrum
+
+  def _vectorize_then_blockify(self, matrix):
+    """Shape batch matrix to batch vector, then blockify trailing dimensions."""
+    # Suppose
+    #   matrix.shape = [m0, m1, m2, m3],
+    # and matrix is a matrix because the final two dimensions are matrix dims.
+    #   self.block_depth = 2,
+    #   self.block_shape = [b0, b1]  (note b0 * b1 = m2).
+    # We will reshape matrix to
+    #   [m3, m0, m1, b0, b1].
+
+    # Vectorize: Reshape to batch vector.
+    #   [m0, m1, m2, m3] --> [m3, m0, m1, m2]
+    # This is called "vectorize" because we have taken the final two matrix dims
+    # and turned this into a size m3 batch of vectors.
+    vec = distribution_util.rotate_transpose(matrix, shift=1)
+
+    # Blockify: Blockify trailing dimensions.
+    #   [m3, m0, m1, m2] --> [m3, m0, m1, b0, b1]
+    if (vec.get_shape().is_fully_defined() and
+        self.block_shape.is_fully_defined()):
+      # vec_leading_shape = [m3, m0, m1],
+      # the parts of vec that will not be blockified.
+      vec_leading_shape = vec.get_shape()[:-1]
+      final_shape = vec_leading_shape.concatenate(self.block_shape)
+    else:
+      vec_leading_shape = array_ops.shape(vec)[:-1]
+      final_shape = array_ops.concat(
+          (vec_leading_shape, self.block_shape_tensor()), 0)
+    return array_ops.reshape(vec, final_shape)
+
+  def _unblockify_then_matricize(self, vec):
+    """Flatten the block dimensions then reshape to a batch matrix."""
+    # Suppose
+    #   vec.shape = [v0, v1, v2, v3],
+    #   self.block_depth = 2.
+    # Then
+    #   leading shape = [v0, v1]
+    #   block shape = [v2, v3].
+    # We will reshape vec to
+    #   [v1, v2*v3, v0].
+
+    # Un-blockify: Flatten block dimensions.  Reshape
+    #   [v0, v1, v2, v3] --> [v0, v1, v2*v3].
+    if vec.get_shape().is_fully_defined():
+      # vec_shape = [v0, v1, v2, v3]
+      vec_shape = vec.get_shape().as_list()
+      # vec_leading_shape = [v0, v1]
+      vec_leading_shape = vec_shape[:-self.block_depth]
+      # vec_block_shape = [v2, v3]
+      vec_block_shape = vec_shape[-self.block_depth:]
+      # flat_shape = [v0, v1, v2*v3]
+      flat_shape = vec_leading_shape + [np.prod(vec_block_shape)]
+    else:
+      vec_shape = array_ops.shape(vec)
+      vec_leading_shape = vec_shape[:-self.block_depth]
+      vec_block_shape = vec_shape[-self.block_depth:]
+      flat_shape = array_ops.concat(
+          (vec_leading_shape, [math_ops.reduce_prod(vec_block_shape)]), 0)
+    vec_flat = array_ops.reshape(vec, flat_shape)
+
+    # Matricize:  Reshape to batch matrix.
+    #   [v0, v1, v2*v3] --> [v1, v2*v3, v0],
+    # representing a shape [v1] batch of [v2*v3, v0] matrices.
+    matrix = distribution_util.rotate_transpose(vec_flat, shift=-1)
+    return matrix
+
+  def _fft(self, x):
+    """FFT along the last self.block_depth dimensions of x.
+
+    Args:
+      x: `Tensor` with floating or complex `dtype`.
+        Should be in the form returned by self._vectorize_then_blockify.
+
+    Returns:
+      `Tensor` with `dtype` `complex64`.
+    """
+    x_complex = _to_complex(x)
+    return _FFT_OP[self.block_depth](x_complex)
+
+  def _ifft(self, x):
+    """IFFT along the last self.block_depth dimensions of x.
+
+    Args:
+      x: `Tensor` with floating or complex dtype.  Should be in the form
+        returned by self._vectorize_then_blockify.
+
+    Returns:
+      `Tensor` with `dtype` `complex64`.
+    """
+    x_complex = _to_complex(x)
+    return _IFFT_OP[self.block_depth](x_complex)
+
+  def convolution_kernel(self, name="convolution_kernel"):
+    """Convolution kernel corresponding to `self.spectrum`.
+
+    The `D` dimensional DFT of this kernel is the frequency domain spectrum of
+    this operator.
+
+    Args:
+      name:  A name to give this `Op`.
+
+    Returns:
+      `Tensor` with `dtype` `self.dtype`.
+    """
+    with self._name_scope(name):
+      h = self._ifft(self._spectrum_complex)
+      return math_ops.cast(h, self.dtype)
+
+  def _shape(self):
+    s_shape = self._spectrum.get_shape()
+    # Suppose spectrum.shape = [a, b, c, d]
+    # block_depth = 2
+    # Then:
+    #   batch_shape = [a, b]
+    #   N = c*d
+    # and we want to return
+    #   [a, b, c*d, c*d]
+    batch_shape = s_shape[:-self.block_depth]
+    # trailing_dims = [c, d]
+    trailing_dims = s_shape[-self.block_depth:]
+    if trailing_dims.is_fully_defined():
+      n = np.prod(trailing_dims.as_list())
+    else:
+      n = None
+    n_x_n = tensor_shape.TensorShape([n, n])
+    return batch_shape.concatenate(n_x_n)
+
+  def _shape_tensor(self):
+    # See self.shape for explanation of steps
+    s_shape = array_ops.shape(self._spectrum)
+    batch_shape = s_shape[:-self.block_depth]
+    trailing_dims = s_shape[-self.block_depth:]
+    n = math_ops.reduce_prod(trailing_dims)
+    n_x_n = [n, n]
+    return array_ops.concat((batch_shape, n_x_n), 0)
+
+  def assert_hermitian_spectrum(self, name="assert_hermitian_spectrum"):
+    """Returns an `Op` that asserts this operator has Hermitian spectrum.
+
+    This operator corresponds to a real-valued matrix if and only if its
+    spectrum is Hermitian.
+
+    Args:
+      name:  A name to give this `Op`.
+
+    Returns:
+      An `Op` that asserts this operator has Hermitian spectrum.
+    """
+    eps = np.finfo(self.dtype.real_dtype.as_numpy_dtype).eps
+    with self._name_scope(name):
+      # Tolerance assumes linear accumulation of error; cast N to float first.
+      n = math_ops.cast(self.domain_dimension_tensor(), self.dtype.real_dtype)
+      # Use the raw IFFT: convolution_kernel() casts away imag for real dtype.
+      imag_kernel = math_ops.imag(self._ifft(self._spectrum_complex))
+      return check_ops.assert_less(
+          math_ops.abs(imag_kernel), eps * n,
+          message="Spectrum was not Hermitian")
+
+  def _assert_non_singular(self):
+    return linear_operator_util.assert_no_entries_with_modulus_zero(
+        self.spectrum,
+        message="Singular operator:  Spectrum contained zero values.")
+
+  def _assert_positive_definite(self):
+    # This operator has the action  Ax = F^H D F x,
+    # where D is the diagonal matrix with self.spectrum on the diag.  Therefore,
+    # <x, Ax> = <Fx, DFx>,
+    # Since F is bijective, the condition for positive definite is the same as
+    # for a diagonal matrix, i.e. real part of spectrum is positive.
+    message = (
+        "Not positive definite:  Real part of spectrum was not all positive.")
+    return check_ops.assert_positive(
+        math_ops.real(self.spectrum), message=message)
+
+  def _assert_self_adjoint(self):
+    # Recall correspondence between symmetry and real transforms.  See docstring
+    return linear_operator_util.assert_zero_imag_part(
+        self.spectrum,
+        message=(
+            "Not self-adjoint:  The spectrum contained non-zero imaginary part."
+        ))
+
+  def _broadcast_batch_dims(self, x, spectrum):
+    """Broadcast batch dims of batch matrix `x` and spectrum."""
+    # spectrum.shape = batch_shape + block_shape
+    # First make spectrum a batch matrix with
+    #   spectrum.shape = batch_shape + [prod(block_shape), 1]
+    spec_mat = array_ops.reshape(
+        spectrum, array_ops.concat(
+            (self.batch_shape_tensor(), [-1, 1]), axis=0))
+    # Second, broadcast, possibly requiring an addition of array of zeros.
+    x, spec_mat = linear_operator_util.broadcast_matrix_batch_dims((x,
+                                                                    spec_mat))
+    # Third, put the block shape back into spectrum.
+    batch_shape = array_ops.shape(x)[:-2]
+    spectrum = array_ops.reshape(
+        spec_mat,
+        array_ops.concat((batch_shape, self.block_shape_tensor()), axis=0))
+
+    return x, spectrum
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    x = linalg.adjoint(x) if adjoint_arg else x
+    # With F the matrix of a DFT, and F^{-1}, F^H the inverse and Hermitian
+    # transpose, one can show that F^{-1} = F^{H} is the IDFT matrix.  Therefore
+    # matmul(x) = F^{-1} diag(spectrum) F x,
+    #           = F^{H} diag(spectrum) F x,
+    # so that
+    # matmul(x, adjoint=True) = F^{H} diag(conj(spectrum)) F x.
+    spectrum = self._conj_spectrum if adjoint else self._spectrum_complex
+
+    x, spectrum = self._broadcast_batch_dims(x, spectrum)
+
+    x_vb = self._vectorize_then_blockify(x)
+    fft_x_vb = self._fft(x_vb)
+    block_vector_result = self._ifft(spectrum * fft_x_vb)
+    y = self._unblockify_then_matricize(block_vector_result)
+
+    return math_ops.cast(y, self.dtype)
+
+  def _determinant(self):
+    # det(A) = product of the eigenvalues = product of spectrum over block dims.
+    block_axes = [-(i + 1) for i in range(self.block_depth)]
+    det = math_ops.reduce_prod(self.spectrum, axis=block_axes)
+    return math_ops.cast(det, self.dtype)
+
+  def _log_abs_determinant(self):
+    # log|det(A)| = sum of log|spectrum| over the block dims.
+    block_axes = [-(i + 1) for i in range(self.block_depth)]
+    lad = math_ops.reduce_sum(math_ops.log(self._abs_spectrum), axis=block_axes)
+    return math_ops.cast(lad, self.dtype)
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    rhs = linalg.adjoint(rhs) if adjoint_arg else rhs
+    spectrum = self._conj_spectrum if adjoint else self._spectrum_complex
+
+    rhs, spectrum = self._broadcast_batch_dims(rhs, spectrum)
+
+    rhs_vb = self._vectorize_then_blockify(rhs)
+    fft_rhs_vb = self._fft(rhs_vb)
+    solution_vb = self._ifft(fft_rhs_vb / spectrum)
+    x = self._unblockify_then_matricize(solution_vb)
+    return math_ops.cast(x, self.dtype)
+
+  def _diag_part(self):
+    # Get ones in shape of diag, which is [B1,...,Bb, N]
+    # Also get the size of the diag, "N".
+    if self.shape.is_fully_defined():
+      diag_shape = self.shape[:-1]
+      diag_size = self.domain_dimension.value
+    else:
+      diag_shape = self.shape_tensor()[:-1]
+      diag_size = self.domain_dimension_tensor()
+    ones_diag = array_ops.ones(diag_shape, dtype=self.dtype)
+
+    # As proved in comments in self._trace, the value on the diag is constant,
+    # repeated N times.  This value is the trace divided by N.
+
+    # The handling of self.shape = (0, 0) is tricky, and is the reason we choose
+    # to compute trace and use that to compute diag_part, rather than computing
+    # the value on the diagonal ("diag_value") directly.  Both result in a 0/0,
+    # but in different places, and the current method gives the right result in
+    # the end.
+
+    # Here, if self.shape = (0, 0), then self.trace() = 0., and then
+    # diag_value = 0. / 0. = NaN.
+    diag_value = self.trace() / math_ops.cast(diag_size, self.dtype)
+
+    # If self.shape = (0, 0), then ones_diag = [] (empty tensor), and then
+    # the following line is NaN * [] = [], as needed.
+    return diag_value[..., array_ops.newaxis] * ones_diag
+
+  def _trace(self):
+    # The diagonal of the [[nested] block] circulant operator is the mean of
+    # the spectrum.
+    # Proof:  For the [0,...,0] element, this follows from the IDFT formula.
+    # Then the result follows since all diagonal elements are the same.
+
+    # Therefore, the trace is the sum of the spectrum.
+
+    # Get shape of diag along with the axis over which to reduce the spectrum.
+    # We will reduce the spectrum over all block indices.
+    if self.spectrum.get_shape().is_fully_defined():
+      spec_rank = self.spectrum.get_shape().ndims
+      axis = np.arange(spec_rank - self.block_depth, spec_rank, dtype=np.int32)
+    else:
+      spec_rank = array_ops.rank(self.spectrum)
+      axis = math_ops.range(spec_rank - self.block_depth, spec_rank)
+
+    # Real diag part "re_d".
+    # Suppose spectrum.shape = [B1,...,Bb, N1, N2]
+    # self.shape = [B1,...,Bb, N, N], with N1 * N2 = N.
+    # re_d_value.shape = [B1,...,Bb]
+    re_d_value = math_ops.reduce_sum(math_ops.real(self.spectrum), axis=axis)
+
+    if not self.dtype.is_complex:
+      return math_ops.cast(re_d_value, self.dtype)
+
+    # Imaginary part, "im_d".
+    if self.is_self_adjoint:
+      im_d_value = 0.
+    else:
+      im_d_value = math_ops.reduce_sum(math_ops.imag(self.spectrum), axis=axis)
+
+    return math_ops.cast(math_ops.complex(re_d_value, im_d_value), self.dtype)
+
+
+@tf_export("linalg.LinearOperatorCirculant")
+class LinearOperatorCirculant(_BaseLinearOperatorCirculant):
+  """`LinearOperator` acting like a circulant matrix.
+
+  This operator acts like a circulant matrix `A` with
+  shape `[B1,...,Bb, N, N]` for some `b >= 0`.  The first `b` indices index a
+  batch member.  For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is
+  an `N x N` matrix.  This matrix `A` is not materialized, but for
+  purposes of broadcasting this shape will be relevant.
+
+  #### Description in terms of circulant matrices
+
+  Circulant means the entries of `A` are generated by a single vector, the
+  convolution kernel `h`: `A_{mn} := h_{m-n mod N}`.  With `h = [w, x, y, z]`,
+
+  ```
+  A = |w z y x|
+      |x w z y|
+      |y x w z|
+      |z y x w|
+  ```
+
+  This means that the result of matrix multiplication `v = Au` has `Lth` column
+  given by the circular convolution of `h` with the `Lth` column of `u`.
+
+  See http://ee.stanford.edu/~gray/toeplitz.pdf
+
+  #### Description in terms of the frequency spectrum
+
+  There is an equivalent description in terms of the [batch] spectrum `H` and
+  Fourier transforms.  Here we consider `A.shape = [N, N]` and ignore batch
+  dimensions.  Define the discrete Fourier transform (DFT) and its inverse by
+
+  ```
+  DFT[ h[n] ] = H[k] := sum_{n = 0}^{N - 1} h_n e^{-i 2pi k n / N}
+  IDFT[ H[k] ] = h[n] = N^{-1} sum_{k = 0}^{N - 1} H_k e^{i 2pi k n / N}
+  ```
+
+  From these definitions, we see that
+
+  ```
+  H[0] = sum_{n = 0}^{N - 1} h_n
+  H[1] = "the first positive frequency"
+  H[N - 1] = "the first negative frequency"
+  ```
+
+  Loosely speaking, with `*` element-wise multiplication, matrix multiplication
+  is equal to the action of a Fourier multiplier: `A u = IDFT[ H * DFT[u] ]`.
+  Precisely speaking, given `[N, R]` matrix `u`, let `DFT[u]` be the `[N, R]`
+  matrix with `rth` column equal to the DFT of the `rth` column of `u`.
+  Define the `IDFT` similarly.
+  Matrix multiplication may be expressed columnwise:
+
+  ```(A u)_r = IDFT[ H * (DFT[u])_r ]```
+
+  #### Operator properties deduced from the spectrum.
+
+  Let `u` be the `kth` Euclidean basis vector, and `U = IDFT[u]`.
+  The above formulas show that `A U = H_k * U`.  We conclude that the elements
+  of `H` are the eigenvalues of this operator.   Therefore
+
+  * This operator is positive definite if and only if `Real{H} > 0`.
+
+  A general property of Fourier transforms is the correspondence between
+  Hermitian functions and real valued transforms.
+
+  Suppose `H.shape = [B1,...,Bb, N]`.  We say that `H` is a Hermitian spectrum
+  if, with `%` meaning modulus division,
+
+  ```H[..., n % N] = ComplexConjugate[ H[..., (-n) % N] ]```
+
+  * This operator corresponds to a real matrix if and only if `H` is Hermitian.
+  * This operator is self-adjoint if and only if `H` is real.
+
+  See e.g. "Discrete-Time Signal Processing", Oppenheim and Schafer.
+
+  #### Example of a self-adjoint positive definite operator
+
+  ```python
+  # spectrum is real ==> operator is self-adjoint
+  # spectrum is positive ==> operator is positive definite
+  spectrum = [6., 4, 2]
+
+  operator = LinearOperatorCirculant(spectrum)
+
+  # IFFT[spectrum]
+  operator.convolution_kernel()
+  ==> [4 + 0j, 1 + 0.58j, 1 - 0.58j]
+
+  operator.to_dense()
+  ==> [[4 + 0.0j, 1 - 0.6j, 1 + 0.6j],
+       [1 + 0.6j, 4 + 0.0j, 1 - 0.6j],
+       [1 - 0.6j, 1 + 0.6j, 4 + 0.0j]]
+  ```
+
+  #### Example of defining in terms of a real convolution kernel
+
+  ```python
+  # convolution_kernel is real ==> spectrum is Hermitian.
+  convolution_kernel = [1., 2., 1.]
+  spectrum = tf.fft(tf.cast(convolution_kernel, tf.complex64))
+
+  # spectrum is Hermitian ==> operator is real.
+  # spectrum is shape [3] ==> operator is shape [3, 3]
+  # We force the input/output type to be real, which allows this to operate
+  # like a real matrix.
+  operator = LinearOperatorCirculant(spectrum, input_output_dtype=tf.float32)
+
+  operator.to_dense()
+  ==> [[ 1, 1, 2],
+       [ 2, 1, 1],
+       [ 1, 2, 1]]
+  ```
+
+  #### Example of Hermitian spectrum
+
+  ```python
+  # spectrum is shape [3] ==> operator is shape [3, 3]
+  # spectrum is Hermitian ==> operator is real.
+  spectrum = [1, 1j, -1j]
+
+  operator = LinearOperatorCirculant(spectrum)
+
+  operator.to_dense()
+  ==> [[ 0.33 + 0j,  0.91 + 0j, -0.24 + 0j],
+       [-0.24 + 0j,  0.33 + 0j,  0.91 + 0j],
+       [ 0.91 + 0j, -0.24 + 0j,  0.33 + 0j]]
+  ```
+
+  #### Example of forcing real `dtype` when spectrum is Hermitian
+
+  ```python
+  # spectrum is shape [4] ==> operator is shape [4, 4]
+  # spectrum is real ==> operator is self-adjoint
+  # spectrum is Hermitian ==> operator is real
+  # spectrum has positive real part ==> operator is positive-definite.
+  spectrum = [6., 4, 2, 4]
+
+  # Force the input dtype to be float32.
+  # Cast the output to float32.  This is fine because the operator will be
+  # real due to Hermitian spectrum.
+  operator = LinearOperatorCirculant(spectrum, input_output_dtype=tf.float32)
+
+  operator.shape
+  ==> [4, 4]
+
+  operator.to_dense()
+  ==> [[4, 1, 0, 1],
+       [1, 4, 1, 0],
+       [0, 1, 4, 1],
+       [1, 0, 1, 4]]
+
+  # convolution_kernel = tf.ifft(spectrum)
+  operator.convolution_kernel()
+  ==> [4, 1, 0, 1]
+  ```
+
+  #### Performance
+
+  Suppose `operator` is a `LinearOperatorCirculant` of shape `[N, N]`,
+  and `x.shape = [N, R]`.  Then
+
+  * `operator.matmul(x)` is `O(R*N*Log[N])`
+  * `operator.solve(x)` is `O(R*N*Log[N])`
+  * `operator.determinant()` involves a size `N` `reduce_prod`.
+
+  If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and
+  `[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               spectrum,
+               input_output_dtype=_DTYPE_COMPLEX,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=True,
+               name="LinearOperatorCirculant"):
+    r"""Initialize a `LinearOperatorCirculant`.
+
+    This `LinearOperator` is initialized to have shape `[B1,...,Bb, N, N]`
+    by providing `spectrum`, a `[B1,...,Bb, N]` `Tensor`.
+
+    If `input_output_dtype = DTYPE`:
+
+    * Arguments to methods such as `matmul` or `solve` must be `DTYPE`.
+    * Values returned by all methods, such as `matmul` or `determinant` will be
+      cast to `DTYPE`.
+
+    Note that if the spectrum is not Hermitian, then this operator corresponds
+    to a complex matrix with non-zero imaginary part.  In this case, setting
+    `input_output_dtype` to a real type will forcibly cast the output to be
+    real, resulting in incorrect results!
+
+    If on the other hand the spectrum is Hermitian, then this operator
+    corresponds to a real-valued matrix, and setting `input_output_dtype` to
+    a real type is fine.
+
+    Args:
+      spectrum:  Shape `[B1,...,Bb, N]` `Tensor`.  Allowed dtypes are
+        `float32`, `complex64`.  Type can be different than `input_output_dtype`
+      input_output_dtype: `dtype` for input/output.  Must be either
+        `float32` or `complex64`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.  If `spectrum` is real, this will always be true.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix\
+            #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name:  A name to prepend to all ops created by this class.
+    """
+    super(LinearOperatorCirculant, self).__init__(
+        spectrum,
+        block_depth=1,
+        input_output_dtype=input_output_dtype,
+        is_non_singular=is_non_singular,
+        is_self_adjoint=is_self_adjoint,
+        is_positive_definite=is_positive_definite,
+        is_square=is_square,
+        name=name)
+
+
+@tf_export("linalg.LinearOperatorCirculant2D")
+class LinearOperatorCirculant2D(_BaseLinearOperatorCirculant):
+  """`LinearOperator` acting like a block circulant matrix.
+
+  This operator acts like a block circulant matrix `A` with
+  shape `[B1,...,Bb, N, N]` for some `b >= 0`.  The first `b` indices index a
+  batch member.  For every batch index `(i1,...,ib)`, `A[i1,...,ib, :, :]` is
+  an `N x N` matrix.  This matrix `A` is not materialized, but for
+  purposes of broadcasting this shape will be relevant.
+
+  #### Description in terms of block circulant matrices
+
+  If `A` is block circulant, with block sizes `N0, N1` (`N0 * N1 = N`):
+  `A` has a block circulant structure, composed of `N0 x N0` blocks, with each
+  block an `N1 x N1` circulant matrix.
+
+  For example, with `W`, `X`, `Y`, `Z` each circulant,
+
+  ```
+  A = |W Z Y X|
+      |X W Z Y|
+      |Y X W Z|
+      |Z Y X W|
+  ```
+
+  Note that `A` itself will not in general be circulant.
+
+  #### Description in terms of the frequency spectrum
+
+  There is an equivalent description in terms of the [batch] spectrum `H` and
+  Fourier transforms.  Here we consider `A.shape = [N, N]` and ignore batch
+  dimensions.
+
+  If `H.shape = [N0, N1]`, (`N0 * N1 = N`):
+  Loosely speaking, matrix multiplication is equal to the action of a
+  Fourier multiplier:  `A u = IDFT2[ H DFT2[u] ]`.
+  Precisely speaking, given `[N, R]` matrix `u`, let `DFT2[u]` be the
+  `[N0, N1, R]` `Tensor` defined by re-shaping `u` to `[N0, N1, R]` and taking
+  a two dimensional DFT across the first two dimensions.  Let `IDFT2` be the
+  inverse of `DFT2`.  Matrix multiplication may be expressed columnwise:
+
+  ```(A u)_r = IDFT2[ H * (DFT2[u])_r ]```
+
+  #### Operator properties deduced from the spectrum.
+
+  * This operator is positive definite if and only if `Real{H} > 0`.
+
+  A general property of Fourier transforms is the correspondence between
+  Hermitian functions and real valued transforms.
+
+  Suppose `H.shape = [B1,...,Bb, N0, N1]`, we say that `H` is a Hermitian
+  spectrum if, with `%` indicating modulus division,
+
+  ```
+  H[..., n0 % N0, n1 % N1] = ComplexConjugate[ H[..., (-n0) % N0, (-n1) % N1] ].
+  ```
+
+  * This operator corresponds to a real matrix if and only if `H` is Hermitian.
+  * This operator is self-adjoint if and only if `H` is real.
+
+  See e.g. "Discrete-Time Signal Processing", Oppenheim and Schafer.
+
+  #### Example of a self-adjoint positive definite operator
+
+  ```python
+  # spectrum is real ==> operator is self-adjoint
+  # spectrum is positive ==> operator is positive definite
+  spectrum = [[1., 2., 3.],
+              [4., 5., 6.],
+              [7., 8., 9.]]
+
+  operator = LinearOperatorCirculant2D(spectrum)
+
+  # IFFT[spectrum]
+  operator.convolution_kernel()
+  ==> [[5.0+0.0j, -0.5-.3j, -0.5+.3j],
+       [-1.5-.9j,        0,        0],
+       [-1.5+.9j,        0,        0]]
+
+  operator.to_dense()
+  ==> Complex self adjoint 9 x 9 matrix.
+  ```
+
+  #### Example of defining in terms of a real convolution kernel
+
+  ```python
+  # convolution_kernel is real ==> spectrum is Hermitian.
+  convolution_kernel = [[1., 2., 1.], [5., -1., 1.]]
+  spectrum = tf.fft2d(tf.cast(convolution_kernel, tf.complex64))
+
+  # spectrum is shape [2, 3] ==> operator is shape [6, 6]
+  # spectrum is Hermitian ==> operator is real.
+  operator = LinearOperatorCirculant2D(spectrum, input_output_dtype=tf.float32)
+  ```
+
+  #### Performance
+
+  Suppose `operator` is a `LinearOperatorCirculant2D` of shape `[N, N]`,
+  and `x.shape = [N, R]`.  Then
+
+  * `operator.matmul(x)` is `O(R*N*Log[N])`
+  * `operator.solve(x)` is `O(R*N*Log[N])`
+  * `operator.determinant()` involves a size `N` `reduce_prod`.
+
+  If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and
+  `[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               spectrum,
+               input_output_dtype=_DTYPE_COMPLEX,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=True,
+               name="LinearOperatorCirculant2D"):
+    r"""Initialize a `LinearOperatorCirculant2D`.
+
+    This `LinearOperator` is initialized to have shape `[B1,...,Bb, N, N]`
+    by providing `spectrum`, a `[B1,...,Bb, N0, N1]` `Tensor` with `N0*N1 = N`.
+
+    If `input_output_dtype = DTYPE`:
+
+    * Arguments to methods such as `matmul` or `solve` must be `DTYPE`.
+    * Values returned by all methods, such as `matmul` or `determinant` will be
+      cast to `DTYPE`.
+
+    Note that if the spectrum is not Hermitian, then this operator corresponds
+    to a complex matrix with non-zero imaginary part.  In this case, setting
+    `input_output_dtype` to a real type will forcibly cast the output to be
+    real, resulting in incorrect results!
+
+    If on the other hand the spectrum is Hermitian, then this operator
+    corresponds to a real-valued matrix, and setting `input_output_dtype` to
+    a real type is fine.
+
+    Args:
+      spectrum:  Shape `[B1,...,Bb, N0, N1]` `Tensor`.  Allowed dtypes are
+        `float32`, `complex64`.  Type can be different than `input_output_dtype`
+      input_output_dtype: `dtype` for input/output.  Must be either
+        `float32` or `complex64`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.  If `spectrum` is real, this will always be true.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix\
+            #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name:  A name to prepend to all ops created by this class.
+    """
+    super(LinearOperatorCirculant2D, self).__init__(
+        spectrum,
+        block_depth=2,
+        input_output_dtype=input_output_dtype,
+        is_non_singular=is_non_singular,
+        is_self_adjoint=is_self_adjoint,
+        is_positive_definite=is_positive_definite,
+        is_square=is_square,
+        name=name)
+
+
+@tf_export("linalg.LinearOperatorCirculant3D")
+class LinearOperatorCirculant3D(_BaseLinearOperatorCirculant):
+  """`LinearOperator` acting like a nested block circulant matrix.
+
+  This operator acts like a block circulant matrix `A` with
+  shape `[B1,...,Bb, N, N]` for some `b >= 0`.  The first `b` indices index a
+  batch member.  For every batch index `(i1,...,ib)`, `A[i1,...,ib, :, :]` is
+  an `N x N` matrix.  This matrix `A` is not materialized, but for
+  purposes of broadcasting this shape will be relevant.
+
+  #### Description in terms of block circulant matrices
+
+  If `A` is nested block circulant, with block sizes `N0, N1, N2`
+  (`N0 * N1 * N2 = N`):
+  `A` has a block structure, composed of `N0 x N0` blocks, with each
+  block an `N1 x N1` block circulant matrix.
+
+  For example, with `W`, `X`, `Y`, `Z` each block circulant,
+
+  ```
+  A = |W Z Y X|
+      |X W Z Y|
+      |Y X W Z|
+      |Z Y X W|
+  ```
+
+  Note that `A` itself will not in general be circulant.
+
+  #### Description in terms of the frequency spectrum
+
+  There is an equivalent description in terms of the [batch] spectrum `H` and
+  Fourier transforms.  Here we consider `A.shape = [N, N]` and ignore batch
+  dimensions.
+
+  If `H.shape = [N0, N1, N2]`, (`N0 * N1 * N2 = N`):
+  Loosely speaking, matrix multiplication is equal to the action of a
+  Fourier multiplier:  `A u = IDFT3[ H DFT3[u] ]`.
+  Precisely speaking, given `[N, R]` matrix `u`, let `DFT3[u]` be the
+  `[N0, N1, N2, R]` `Tensor` defined by re-shaping `u` to `[N0, N1, N2, R]` and
+  taking a three dimensional DFT across the first three dimensions.  Let `IDFT3`
+  be the inverse of `DFT3`.  Matrix multiplication may be expressed columnwise:
+
+  ```(A u)_r = IDFT3[ H * (DFT3[u])_r ]```
+
+  #### Operator properties deduced from the spectrum.
+
+  * This operator is positive definite if and only if `Real{H} > 0`.
+
+  A general property of Fourier transforms is the correspondence between
+  Hermitian functions and real valued transforms.
+
+  Suppose `H.shape = [B1,...,Bb, N0, N1, N2]`, we say that `H` is a Hermitian
+  spectrum if, with `%` meaning modulus division,
+
+  ```
+  H[..., n0 % N0, n1 % N1, n2 % N2]
+    = ComplexConjugate[ H[..., (-n0) % N0, (-n1) % N1, (-n2) % N2] ].
+  ```
+
+  * This operator corresponds to a real matrix if and only if `H` is Hermitian.
+  * This operator is self-adjoint if and only if `H` is real.
+
+  See e.g. "Discrete-Time Signal Processing", Oppenheim and Schafer.
+
+  #### Examples
+
+  See `LinearOperatorCirculant` and `LinearOperatorCirculant2D` for examples.
+
+  #### Performance
+
+  Suppose `operator` is a `LinearOperatorCirculant3D` of shape `[N, N]`,
+  and `x.shape = [N, R]`.  Then
+
+  * `operator.matmul(x)` is `O(R*N*Log[N])`
+  * `operator.solve(x)` is `O(R*N*Log[N])`
+  * `operator.determinant()` involves a size `N` `reduce_prod`.
+
+  If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and
+  `[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               spectrum,
+               input_output_dtype=_DTYPE_COMPLEX,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=True,
+               name="LinearOperatorCirculant3D"):
+    """Initialize a `LinearOperatorCirculant3D`.
+
+    This `LinearOperator` is initialized to have shape `[B1,...,Bb, N, N]`
+    by providing `spectrum`, a `[B1,...,Bb, N0, N1, N2]` `Tensor`
+    with `N0*N1*N2 = N`.
+
+    If `input_output_dtype = DTYPE`:
+
+    * Arguments to methods such as `matmul` or `solve` must be `DTYPE`.
+    * Values returned by all methods, such as `matmul` or `determinant` will be
+      cast to `DTYPE`.
+
+    Note that if the spectrum is not Hermitian, then this operator corresponds
+    to a complex matrix with non-zero imaginary part.  In this case, setting
+    `input_output_dtype` to a real type will forcibly cast the output to be
+    real, resulting in incorrect results!
+
+    If on the other hand the spectrum is Hermitian, then this operator
+    corresponds to a real-valued matrix, and setting `input_output_dtype` to
+    a real type is fine.
+
+    Args:
+      spectrum:  Shape `[B1,...,Bb, N0, N1, N2]` `Tensor`.  Allowed dtypes are
+        `float32`, `complex64`.  Type can be different than `input_output_dtype`
+      input_output_dtype: `dtype` for input/output.  Must be either
+        `float32` or `complex64`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.  If `spectrum` is real, this will always be true.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the real part of all eigenvalues is positive.  We do not require
+        the operator to be self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix
+            #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name:  A name to prepend to all ops created by this class.
+    """
+    super(LinearOperatorCirculant3D, self).__init__(
+        spectrum,
+        block_depth=3,
+        input_output_dtype=input_output_dtype,
+        is_non_singular=is_non_singular,
+        is_self_adjoint=is_self_adjoint,
+        is_positive_definite=is_positive_definite,
+        is_square=is_square,
+        name=name)
+
+
+def _to_complex(x):
+  return math_ops.cast(x, _DTYPE_COMPLEX)
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
new file mode 100644
index 0000000000..3b33f3da97
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.linalg.LinearOperatorCirculant.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt
new file mode 100644
index 0000000000..de917706d5
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -0,0 +1,155 @@
+path: "tensorflow.linalg.LinearOperatorCirculant"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant.LinearOperatorCirculant\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_depth"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "spectrum"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'spectrum\', \'input_output_dtype\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'complex64\'>\", \'None\', \'None\', \'None\', \'True\', \'LinearOperatorCirculant\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "assert_hermitian_spectrum"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "block_shape_tensor"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "convolution_kernel"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
new file mode 100644
index 0000000000..591bc9631a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.linalg.LinearOperatorCirculant2D.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
new file mode 100644
index 0000000000..c4e6a21c3a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -0,0 +1,155 @@
+path: "tensorflow.linalg.LinearOperatorCirculant2D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant.LinearOperatorCirculant2D\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_depth"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "spectrum"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'spectrum\', \'input_output_dtype\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'complex64\'>\", \'None\', \'None\', \'None\', \'True\', \'LinearOperatorCirculant2D\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "assert_hermitian_spectrum"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "block_shape_tensor"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "convolution_kernel"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
new file mode 100644
index 0000000000..d643139a53
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.linalg.LinearOperatorCirculant3D.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
new file mode 100644
index 0000000000..2e085a8e28
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -0,0 +1,155 @@
+path: "tensorflow.linalg.LinearOperatorCirculant3D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant.LinearOperatorCirculant3D\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_depth"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "block_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "spectrum"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'spectrum\', \'input_output_dtype\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'complex64\'>\", \'None\', \'None\', \'None\', \'True\', \'LinearOperatorCirculant3D\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "assert_hermitian_spectrum"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "block_shape_tensor"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "convolution_kernel"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
index 1d9c0c0f6d..7a5c533872 100644
--- a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
@@ -4,6 +4,18 @@ tf_module {
     name: "LinearOperator"
     mtype: "<class \'abc.ABCMeta\'>"
   }
+  member {
+    name: "LinearOperatorCirculant"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "LinearOperatorCirculant2D"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "LinearOperatorCirculant3D"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
   member {
     name: "LinearOperatorComposition"
     mtype: "<class \'abc.ABCMeta\'>"
-- 
GitLab


From b9e12bc69df65eca279a90045d045e661fdb8108 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 06:24:43 -0700
Subject: [PATCH 353/450] Make tf.contrib.framework.zero_initializer work with
 ResourceVariable

PiperOrigin-RevId: 194077027
---
 tensorflow/contrib/framework/BUILD            |  1 +
 .../framework/kernels/zero_initializer_op.cc  | 71 +++++++++++++++++++
 .../contrib/framework/ops/variable_ops.cc     | 29 ++++++++
 .../contrib/framework/python/ops/variables.py |  8 ++-
 .../framework/python/ops/variables_test.py    | 26 +++++++
 5 files changed, 134 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index b1c8ad49ea..f675cc0cf0 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -93,6 +93,7 @@ tf_kernel_library(
     ],
     deps = [
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
     ],
     alwayslink = 1,
diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
index 5bf6b67529..6ab3f460b3 100644
--- a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
+++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_var.h"
 
 namespace tensorflow {
 
@@ -85,4 +86,74 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ZeroVarInitializer : public OpKernel {
+ public:
+  explicit ZeroVarInitializer(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &shape_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    Var* variable = nullptr;
+    OP_REQUIRES_OK(ctx, LookupOrCreateResource<Var>(
+                            ctx, HandleFromInput(ctx, 0), &variable,
+                            [this, ctx](Var** var_ptr) {
+                              *var_ptr = new Var(dtype_);
+                              PersistentTensor unused;
+                              Tensor* var_tensor = nullptr;
+                              AllocatorAttributes attr;
+                              attr.set_gpu_compatible(true);
+                              attr.set_nic_compatible(true);
+                              TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+                                  dtype_, shape_, &unused, &var_tensor, attr));
+
+                              functor::TensorSetZero<Device, T>()(
+                                  ctx->eigen_device<Device>(),
+                                  var_tensor->flat<T>());
+
+                              *(*var_ptr)->tensor() = *var_tensor;
+
+                              return Status::OK();
+                            }));
+
+    core::ScopedUnref scoped(variable);
+    mutex_lock ml(*variable->mu());
+
+    OP_REQUIRES(ctx, !variable->is_initialized,
+                errors::InvalidArgument("input is already initialized"));
+
+    variable->is_initialized = true;
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    output->scalar<ResourceHandle>()() = HandleFromInput(ctx, 0);
+  }
+
+ private:
+  DataType dtype_;
+  TensorShape shape_;
+};
+
+#define REGISTER_CPU_KERNELS(type)                            \
+  REGISTER_KERNEL_BUILDER(Name("ZeroVarInitializer")          \
+                              .Device(DEVICE_CPU)             \
+                              .TypeConstraint<type>("dtype"), \
+                          ZeroVarInitializer<Eigen::ThreadPoolDevice, type>);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+#define REGISTER_GPU_KERNELS(type)                           \
+  REGISTER_KERNEL_BUILDER(Name("ZeroVarInitializer")         \
+                              .Device(DEVICE_GPU)            \
+                              .TypeConstraint<type>("dtype") \
+                              .HostMemory("var"),            \
+                          ZeroVarInitializer<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+#endif  // GOOGLE_CUDA
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/framework/ops/variable_ops.cc b/tensorflow/contrib/framework/ops/variable_ops.cc
index 706134ba9a..f6ee6cdb57 100644
--- a/tensorflow/contrib/framework/ops/variable_ops.cc
+++ b/tensorflow/contrib/framework/ops/variable_ops.cc
@@ -39,4 +39,33 @@ ref: Should be from a `Variable` node.
 output_ref:= Same as "ref".
 )doc");
 
+REGISTER_OP("ZeroVarInitializer")
+    .Input("var: resource")
+    .Output("output_var: resource")
+    .Attr("dtype: type")
+    .Attr("shape: shape")
+    .SetAllowsUninitializedInput()
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("dtype", &t));
+      PartialTensorShape p;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &p));
+      shape_inference::ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(p, &s));
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Initialize 'var' with all zeros. This op requires that the resource var is not
+initialized. The var will first be allocated memory, then be filled with all
+zeros. This op is intended to save memory during initialization,
+if you use this op, you should not run initializer of the var.
+
+var: Should be a ResourceVariable.
+output_var:= Same as "var".
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index 0754c3e0e3..40ae01bfcc 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import tf_logging as logging
@@ -82,7 +83,12 @@ def zero_initializer(ref, use_locking=True, name="zero_initializer"):
   """
   loader.load_op_library(
       resource_loader.get_path_to_datafile("_variable_ops.so"))
-  return gen_variable_ops.zero_initializer(ref, name=name)
+  if resource_variable_ops.is_resource_variable(ref):
+    return gen_variable_ops.zero_var_initializer(
+        ref.handle, shape=ref.shape, dtype=ref.dtype, name=name)
+  else:
+    return gen_variable_ops.zero_initializer(ref, name=name)
+
 
 @deprecated(None, "Please switch to tf.train.assert_global_step")
 def assert_global_step(global_step_tensor):
diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py
index 2f06df93ac..37ea6eb12a 100644
--- a/tensorflow/contrib/framework/python/ops/variables_test.py
+++ b/tensorflow/contrib/framework/python/ops/variables_test.py
@@ -1284,6 +1284,32 @@ class ZeroInitializerOpTest(test.TestCase):
                 [10, 20], dtype=dtype), use_init)
 
 
+class ZeroVarInitializerOpTest(test.TestCase):
+
+  def _testZeroVarInitializer(self, shape, initializer, use_init):
+    var = resource_variable_ops.ResourceVariable(initializer)
+    var_zero = variables_lib2.zero_initializer(var)
+
+    with self.test_session() as sess:
+      with self.assertRaisesOpError('Error while reading resource variable'):
+        var.eval()
+      if use_init:
+        sess.run(var.initializer)
+        with self.assertRaisesOpError('input is already initialized'):
+          var_zero.eval()
+        self.assertAllClose(np.ones(shape), var.eval())
+      else:
+        var_zero.eval()
+        self.assertAllClose(np.zeros(shape), var.eval())
+
+  def testZeroVarInitializer(self):
+    for dtype in (dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64):
+      for use_init in (False, True):
+        self._testZeroVarInitializer([10, 20],
+                                     array_ops.ones([10, 20], dtype=dtype),
+                                     use_init)
+
+
 class FilterVariablesTest(test.TestCase):
 
   def setUp(self):
-- 
GitLab


From 5eb233d0686636a7bacc5b8813c079b6b9aa483c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 07:06:27 -0700
Subject: [PATCH 354/450] Introduce a new HLO shape and sharding matcher.

These new matchers can be used in tests in combination with the existing
HLO opcode matchers to better verify a generated HLO graph.

PiperOrigin-RevId: 194082100
---
 tensorflow/compiler/xla/service/BUILD         |  1 +
 .../compiler/xla/service/hlo_matchers.cc      | 63 +++++++++++++++++
 .../compiler/xla/service/hlo_matchers.h       | 69 +++++++++++++++++++
 .../compiler/xla/service/hlo_matchers_test.cc | 58 ++++++++++++++++
 4 files changed, 191 insertions(+)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index afb344e5ae..5edb9440c0 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -359,6 +359,7 @@ cc_library(
         ":hlo",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index bc74c4bc10..69deac263e 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -132,6 +132,69 @@ bool HloCustomCallMatcher::MatchAndExplain(
   return result;
 }
 
+bool HloShapeMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (ShapeUtil::Compatible(instruction->shape(), shape_)) {
+    return true;
+  }
+  *listener << instruction->ToString() << " has incorrect shape (expected: "
+            << ShapeUtil::HumanString(shape_) << ")";
+  return false;
+}
+
+void HloShapeMatcher::DescribeTo(std::ostream* os) const {
+  *os << ShapeUtil::HumanString(shape_);
+}
+
+bool HloShapeAndLayoutMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (ShapeUtil::Equal(instruction->shape(), shape_)) {
+    return true;
+  }
+  *listener << instruction->ToString() << " has incorrect shape (expected: "
+            << ShapeUtil::HumanStringWithLayout(shape_) << ")";
+  return false;
+}
+
+void HloShapeAndLayoutMatcher::DescribeTo(std::ostream* os) const {
+  *os << ShapeUtil::HumanStringWithLayout(shape_);
+}
+
+bool HloShardingMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (!sharding_.has_value()) {
+    if (!instruction->has_sharding()) {
+      return true;
+    }
+    *listener << instruction->ToString() << " expected to have no sharding.";
+    return false;
+  }
+  if (instruction->has_sharding()) {
+    if (instruction->sharding() == sharding_.value()) {
+      return true;
+    }
+    *listener << instruction->ToString()
+              << " has incorrect sharding (expected: " << sharding_->ToString()
+              << ")";
+    return false;
+  } else {
+    *listener << instruction->ToString()
+              << " has no sharding (expected: " << sharding_->ToString() << ")";
+    return false;
+  }
+}
+
+void HloShardingMatcher::DescribeTo(std::ostream* os) const {
+  if (sharding_.has_value()) {
+    *os << sharding_->ToString();
+  } else {
+    *os << "<no-sharding>";
+  }
+}
+
 }  // namespace testing
 
 void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 103f04a2cb..f2ab9b5d9b 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 
 namespace xla {
 namespace testing {
@@ -86,6 +87,50 @@ class HloCustomCallMatcher : public HloMatcher {
   ::testing::Matcher<string> call_target_matcher_;
 };
 
+class HloShapeMatcher
+    : public ::testing::MatcherInterface<const HloInstruction*> {
+ public:
+  explicit HloShapeMatcher(const Shape& shape) : shape_(shape) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  Shape shape_;
+};
+
+class HloShapeAndLayoutMatcher
+    : public ::testing::MatcherInterface<const HloInstruction*> {
+ public:
+  explicit HloShapeAndLayoutMatcher(const Shape& shape) : shape_(shape) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  Shape shape_;
+};
+
+// Verify the sharding of an instruction against the provided HloSharding. If a
+// nullopt is provided for the expected sharding then it checks that no sharding
+// is present for an instruction.
+class HloShardingMatcher
+    : public ::testing::MatcherInterface<const HloInstruction*> {
+ public:
+  explicit HloShardingMatcher(
+      const tensorflow::gtl::optional<HloSharding>& sharding)
+      : sharding_(sharding) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  tensorflow::gtl::optional<HloSharding> sharding_;
+};
+
 // HloInstruction* matchers for opcode and operands. Example:
 //   namespace op = xla::opcode_matchers;
 //   EXPECT_THAT(instruction,
@@ -231,6 +276,30 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall() {
       new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {}));
 }
 
+// Verifies the shape or the shape and the layout of an HLO instruction against
+// the provided shape object.
+inline ::testing::Matcher<const ::xla::HloInstruction*> Shape(
+    const class Shape& shape) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(shape));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
+    const class Shape& shape) {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloShapeAndLayoutMatcher(shape));
+}
+
+// Verifies the value of the HloSharding against the provided sharding object.
+inline ::testing::Matcher<const ::xla::HloInstruction*> Sharding(
+    const HloSharding& sharding) {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloShardingMatcher(sharding));
+}
+// Verifies that no HloSharding is set for an HLO instruction.
+inline ::testing::Matcher<const ::xla::HloInstruction*> NoSharding() {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloShardingMatcher(tensorflow::gtl::nullopt));
+}
+
 #undef HLO_MATCHER
 }  // namespace opcode_matchers
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index 1c21703a45..c6373b2e46 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -100,5 +100,63 @@ TEST(HloMatchersTest, CustomCallMatcher) {
               R"(custom-call with call target that is equal to "foo_target")");
 }
 
+TEST(HloMatchersTest, ShapeMatcher) {
+  auto p0 = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1}), "param");
+
+  EXPECT_THAT(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {5, 7})));
+  EXPECT_THAT(
+      p0.get(),
+      ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {5, 7}))));
+  EXPECT_THAT(p0.get(),
+              ::testing::Not(op::Shape(ShapeUtil::MakeShape(F32, {7, 5}))));
+  EXPECT_THAT(
+      p0.get(),
+      ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {7, 5}))));
+  EXPECT_THAT(p0.get(),
+              op::Shape(ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1})));
+  EXPECT_THAT(p0.get(), op::ShapeWithLayout(ShapeUtil::MakeShapeWithLayout(
+                            F32, {5, 7}, {0, 1})));
+  EXPECT_THAT(p0.get(),
+              ::testing::Not(op::ShapeWithLayout(
+                  ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {1, 0}))));
+
+  EXPECT_THAT(Explain(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {7, 5}))),
+              "%param = f32[5,7]{0,1} parameter(0) has incorrect shape "
+              "(expected: f32[7,5])");
+  EXPECT_THAT(
+      Explain(p0.get(), op::ShapeWithLayout(ShapeUtil::MakeShapeWithLayout(
+                            F32, {7, 5}, {1, 0}))),
+      "%param = f32[5,7]{0,1} parameter(0) has incorrect shape "
+      "(expected: f32[7,5]{1,0})");
+}
+
+TEST(HloMatchersTest, ShardingMatcher) {
+  auto p0 = HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {5}),
+                                            "param.0");
+  p0->clear_sharding();
+  auto p1 = HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {7}),
+                                            "param.1");
+  p1->set_sharding(HloSharding::AssignDevice(1));
+
+  EXPECT_THAT(p0.get(), op::NoSharding());
+  EXPECT_THAT(p0.get(),
+              ::testing::Not(op::Sharding(HloSharding::AssignDevice(1))));
+  EXPECT_THAT(p1.get(), ::testing::Not(op::NoSharding()));
+  EXPECT_THAT(p1.get(),
+              ::testing::Not(op::Sharding(HloSharding::AssignDevice(0))));
+  EXPECT_THAT(p1.get(), op::Sharding(HloSharding::AssignDevice(1)));
+
+  EXPECT_THAT(Explain(p0.get(), op::Sharding(HloSharding::AssignDevice(1))),
+              "%param.0 = f32[5]{0} parameter(0) has no sharding (expected: "
+              "{maximal device=1})");
+  EXPECT_THAT(Explain(p1.get(), op::NoSharding()),
+              "%param.1 = f32[7]{0} parameter(1), sharding={maximal device=1} "
+              "expected to have no sharding.");
+  EXPECT_THAT(Explain(p1.get(), op::Sharding(HloSharding::AssignDevice(0))),
+              "%param.1 = f32[7]{0} parameter(1), sharding={maximal device=1} "
+              "has incorrect sharding (expected: {maximal device=0})");
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 1ce99cfa52b19a40cff8a9ae983a0a7f04eb2bf1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 07:38:49 -0700
Subject: [PATCH 355/450] Softens the requirements in the HLO sharding
 validation

The goal is to support tiled shardings where the last N tiles have no data.

PiperOrigin-RevId: 194085302
---
 .../compiler/xla/service/hlo_sharding.cc      | 39 +++++++------------
 .../compiler/xla/service/hlo_sharding_test.cc | 15 ++-----
 2 files changed, 16 insertions(+), 38 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 1b42349b0b..994de44123 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -256,37 +256,24 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
         ", input_shape=", ShapeUtil::HumanString(shape));
   }
 
-  // The tile shape must not be the same as the input shape without maximal_
-  // also set. If this is the case, we're not actually sharded and the correct
-  // constructor should have been used.
-  if (ShapeUtil::Equal(shape, tile_shape_)) {
+  // The correct constructor has to be used to create tile maximal shardings.
+  if (tile_assignment_.num_elements() == 1) {
     return tensorflow::errors::InvalidArgument(
-        "Tile shape is the same as the input shape. If a replicated sharding "
-        "was intended, use HloSharding::Replicated(). If a device placement "
-        "was intended, use HloSharding::AssignDevice()");
+        "Tile assignment only contains a single device. If a replicated "
+        "sharding was intended, use HloSharding::Replicated(). If a device "
+        "placement was intended, use HloSharding::AssignDevice()");
   }
 
-  // The tile shape must not be greater than the input shape in any dimension.
-  for (int64 i = 0, e = ShapeUtil::Rank(shape); i != e; ++i) {
-    auto tile_dim = tile_shape_.dimensions(i);
-    auto shape_dim = shape.dimensions(i);
-    if (tile_dim > shape_dim) {
-      return tensorflow::errors::InvalidArgument(
-          StrCat("Tile is larger than input shape (dimension ", i, ", ",
-                 tile_dim, " > ", shape_dim));
-    }
-  }
-
-  // The tile assignment tensor must be exactly dimensioned to ceil(shape[dim]
-  // tile[dim]) for every dimension contained within tile.
+  // The tile assignment tensor must contain enough elements to cover the full
+  // shape with tiles of the specified size.
   for (int64 i = 0, e = tile_assignment_.dimensions().size(); i != e; ++i) {
-    int64 expected_dim =
-        CeilOfRatio(shape.dimensions(i), tile_shape_.dimensions(i));
-    if (tile_assignment_.dimensions()[i] != expected_dim) {
+    int64 total_tile_size = tile_assignment_.dim(i) * tile_shape_.dimensions(i);
+    if (shape.dimensions(i) > total_tile_size) {
       return tensorflow::errors::InvalidArgument(
-          StrCat("Tile assignment tensor has incorrect shape. Dimension ", i,
-                 " expected ", expected_dim, " but got ",
-                 tile_assignment_.dimensions()[i]));
+          StrCat("Tile assignment tensor has too few element to cover the full "
+                 "shape. Dimension ",
+                 i, ", shape ", shape.dimensions(i), ", total size ",
+                 total_tile_size));
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index 69ea4233e4..3bf0d25efb 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -88,7 +88,7 @@ TEST_F(HloShardingTest, Tile) {
   }
 
   {
-    // Test should pass.
+    // Test should fail because more devices are used than `num_devices`.
     Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
     HloSharding sharding =
         HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3}));
@@ -97,17 +97,8 @@ TEST_F(HloShardingTest, Tile) {
   }
 
   {
-    // Test should fail due to the tile being larger than the input space.
-    Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
-    HloSharding sharding =
-        HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3}));
-    EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {2, 2}),
-                                       /*num_devices=*/4));
-  }
-
-  {
-    // Test should fail due to the tile not dividing the input space into 4
-    // sections (even with padding).
+    // Test should fail because the total tiled size in dimension 0 is 4 but we
+    // have 6 elements along that dimension.
     Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
     HloSharding sharding =
         HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3}));
-- 
GitLab


From 38b531ddfb1e2fd0afd765710e4416fd555b98ae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 09:11:15 -0700
Subject: [PATCH 356/450] Internal Change

PiperOrigin-RevId: 194096341
---
 tensorflow/core/BUILD                         | 74 ++++++++++++++++---
 .../core/platform/default/build_config.bzl    | 49 +++++++++++-
 tensorflow/tensorflow.bzl                     | 33 +++++++--
 tensorflow/tools/proto_text/BUILD             |  7 +-
 .../proto_text/gen_proto_text_functions.cc    |  6 +-
 5 files changed, 146 insertions(+), 23 deletions(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index ba1fd41565..843fd7b907 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -161,7 +161,7 @@ exports_files(["ops/ops.pbtxt"])
 # Note that some protos are in neither additional_core_proto_srcs nor this
 # filegroup; e.g.  ones with individual proto_library targets.
 # LINT.IfChange
-CORE_PROTO_SRCS = [
+COMMON_PROTO_SRCS = [
     "example/example.proto",
     "example/feature.proto",
     "framework/allocation_description.proto",
@@ -189,7 +189,6 @@ CORE_PROTO_SRCS = [
     "framework/types.proto",
     "framework/variable.proto",
     "framework/versions.proto",
-    "lib/core/error_codes.proto",
     "protobuf/config.proto",
     "protobuf/cluster.proto",
     "protobuf/debug.proto",
@@ -202,8 +201,14 @@ CORE_PROTO_SRCS = [
     "util/memmapped_file_system.proto",
     "util/saved_tensor_slice.proto",
 ]
+
+ERROR_CODES_PROTO_SRCS = [
+    "lib/core/error_codes.proto",
+]
 # LINT.ThenChange(//tensorflow/core/android_proto_config.asciipb)
 
+CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS
+
 # Protos which are not needed on mobile builds, but should be included in
 # protos_all.
 #
@@ -224,12 +229,16 @@ ADDITIONAL_CORE_PROTO_SRCS = [
 
 tf_proto_library(
     name = "protos_all",
-    srcs = CORE_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS,
+    srcs = [],
     cc_api_version = 2,
     default_header = True,
     j2objc_api_version = 1,
     java_api_version = 2,
     js_api_version = 2,
+    protodeps = [
+        ":protos_all_proto",
+        ":error_codes_proto",
+    ],
     visibility = ["//visibility:public"],
 )
 
@@ -1134,7 +1143,8 @@ filegroup(
 filegroup(
     name = "mobile_srcs_no_runtime",
     srcs = [
-        ":proto_text_srcs_all",
+        ":protos_all_proto_text_srcs",
+        ":error_codes_proto_text_srcs",
         "//tensorflow/core/platform/default/build_config:android_srcs",
     ] + glob(
         [
@@ -1930,15 +1940,58 @@ cc_library(
     ],
 )
 
-proto_text_hdrs_and_srcs = tf_generate_proto_text_sources(
-    name = "proto_text_srcs_all",
-    srcs = CORE_PROTO_SRCS,
+tf_proto_library(
+    name = "error_codes_proto",
+    srcs = ERROR_CODES_PROTO_SRCS,
+    cc_api_version = 2,
+    default_header = True,
+    j2objc_api_version = 1,
+    java_api_version = 2,
+    js_api_version = 2,
+)
+
+tf_generate_proto_text_sources(
+    name = "error_codes_proto_text",
+    srcs = ERROR_CODES_PROTO_SRCS,
+    protodeps = [],
+    srcs_relative_dir = "tensorflow/core/",
+    deps = [
+        ":error_codes_proto_cc",
+        ":lib_internal",
+    ],
+)
+
+tf_proto_library(
+    name = "protos_all_proto",
+    srcs = COMMON_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS,
+    cc_api_version = 2,
+    default_header = True,
+    j2objc_api_version = 1,
+    java_api_version = 2,
+    js_api_version = 2,
+    protodeps = [
+        ":error_codes_proto",
+    ],
+)
+
+tf_generate_proto_text_sources(
+    name = "protos_all_proto_text",
+    srcs = COMMON_PROTO_SRCS,
+    protodeps = ERROR_CODES_PROTO_SRCS,
     srcs_relative_dir = "tensorflow/core/",
+    deps = [
+        ":error_codes_proto_text",
+        ":lib_internal",
+        ":protos_all_proto_cc",
+    ],
 )
 
 cc_library(
     name = "proto_text",
-    hdrs = proto_text_hdrs_and_srcs.hdrs,
+    hdrs = [
+        ":error_codes_proto_text_hdrs",
+        ":protos_all_proto_text_hdrs",
+    ],
     deps = [
         ":lib",
         ":lib_internal",
@@ -2083,7 +2136,7 @@ tf_cuda_library(
             "util/memmapped_file_system.cc",
             "util/memmapped_file_system_writer.cc",
         ],
-    }) + proto_text_hdrs_and_srcs.srcs + tf_additional_framework_srcs(),
+    }) + tf_additional_framework_srcs(),
     hdrs = FRAMEWORK_INTERNAL_PUBLIC_HEADERS,
     copts = tf_copts(),
     linkopts = select({
@@ -2097,7 +2150,8 @@ tf_cuda_library(
     deps = [
         ":lib",
         ":lib_internal",
-        ":proto_text",
+        ":protos_all_proto_text",
+        ":error_codes_proto_text",
         ":protos_all_cc",
         ":version_lib",
         "//tensorflow/core/platform/default/build_config:platformlib",
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 44356e3438..ca0587e277 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -319,10 +319,34 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
   use_grpc_plugin = None
   if cc_grpc_version:
     use_grpc_plugin = True
+
+  cc_deps = tf_deps(protodeps, "_cc")
+  cc_name = name + "_cc"
+  if not srcs:
+    # This is a collection of sub-libraries. Build header-only and impl
+    # libraries containing all the sources.
+    proto_gen(
+        name = cc_name + "_genproto",
+        deps = [s + "_genproto" for s in cc_deps],
+        protoc = "@protobuf_archive//:protoc",
+        visibility=["//visibility:public"],
+    )
+    native.cc_library(
+        name = cc_name,
+        deps = cc_deps + ["@protobuf_archive//:protobuf_headers"] +
+               if_static([name + "_cc_impl"]),
+    )
+    native.cc_library(
+        name = cc_name + "_impl",
+        deps = [s + "_impl" for s in cc_deps] + ["@protobuf_archive//:cc_wkt_protos"],
+    )
+
+    return
+
   cc_proto_library(
-      name = name + "_cc",
+      name = cc_name,
       srcs = srcs,
-      deps = tf_deps(protodeps, "_cc") + ["@protobuf_archive//:cc_wkt_protos"],
+      deps = cc_deps + ["@protobuf_archive//:cc_wkt_protos"],
       cc_libs = cc_libs + if_static(
           ["@protobuf_archive//:protobuf"],
           ["@protobuf_archive//:protobuf_headers"]
@@ -341,11 +365,28 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
 
 def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
                         testonly=0, srcs_version="PY2AND3", use_grpc_plugin=False):
+  py_deps = tf_deps(protodeps, "_py")
+  py_name = name + "_py"
+  if not srcs:
+    # This is a collection of sub-libraries. Build a genproto rule and a
+    # py_library that just depend on the corresponding protodeps targets.
+    proto_gen(
+        name = py_name + "_genproto",
+        deps = [s + "_genproto" for s in py_deps],
+        protoc = "@protobuf_archive//:protoc",
+        visibility=["//visibility:public"],
+    )
+    native.py_library(
+        name = py_name,
+        deps = py_deps + ["@protobuf_archive//:protobuf_python"])
+
+    return
+
   py_proto_library(
-      name = name + "_py",
+      name = py_name,
       srcs = srcs,
       srcs_version = srcs_version,
-      deps = deps + tf_deps(protodeps, "_py") + ["@protobuf_archive//:protobuf_python"],
+      deps = deps + py_deps + ["@protobuf_archive//:protobuf_python"],
       protoc = "@protobuf_archive//:protoc",
       default_runtime = "@protobuf_archive//:protobuf_python",
       visibility = visibility,
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 51e856bed0..a9ddd4fc60 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -37,20 +37,25 @@ def src_to_test_name(src):
 def full_path(relative_paths):
   return [native.package_name() + "/" + relative for relative in relative_paths]
 
+def _add_tfcore_prefix(src):
+  if src.startswith("//"):
+    return src
+  return "//tensorflow/core:" + src
+
 # List of proto files for android builds
 def tf_android_core_proto_sources(core_proto_sources_relative):
   return [
-      "//tensorflow/core:" + p for p in core_proto_sources_relative
+      _add_tfcore_prefix(p) for p in core_proto_sources_relative
   ]
 
 # Returns the list of pb.h and proto.h headers that are generated for
 # tf_android_core_proto_sources().
 def tf_android_core_proto_headers(core_proto_sources_relative):
   return ([
-      "//tensorflow/core/" + p.replace(".proto", ".pb.h")
+      _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".pb.h")
       for p in core_proto_sources_relative
   ] + [
-      "//tensorflow/core/" + p.replace(".proto", ".proto.h")
+      _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".proto.h")
       for p in core_proto_sources_relative
   ])
 
@@ -1672,22 +1677,36 @@ def cuda_py_tests(name,
 #
 # Return a struct with fields (hdrs, srcs) containing the names of the
 # generated files.
-def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs):
+def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs, protodeps=[], deps=[], visibility=None):
   out_hdrs = (
       [p.replace(".proto", ".pb_text.h")
        for p in srcs] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs])
   out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs]
   native.genrule(
-      name=name,
-      srcs=srcs + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")],
+      name=name + "_srcs",
+      srcs=srcs + protodeps + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")],
       outs=out_hdrs + out_srcs,
+      visibility=visibility,
       cmd=
       "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) "
       + "$(@D) " + srcs_relative_dir + " $(SRCS)",
       tools=[
           clean_dep("//tensorflow/tools/proto_text:gen_proto_text_functions")
       ],)
-  return struct(hdrs=out_hdrs, srcs=out_srcs)
+
+  native.filegroup(
+      name=name + "_hdrs",
+      srcs=out_hdrs,
+      visibility=visibility,
+  )
+
+  native.cc_library(
+      name=name,
+      srcs=out_srcs,
+      hdrs=out_hdrs,
+      visibility=visibility,
+      deps = deps,
+  )
 
 def tf_genrule_cmd_append_to_srcs(to_append):
   return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append +
diff --git a/tensorflow/tools/proto_text/BUILD b/tensorflow/tools/proto_text/BUILD
index ef7bfdd3c9..31e8fb9120 100644
--- a/tensorflow/tools/proto_text/BUILD
+++ b/tensorflow/tools/proto_text/BUILD
@@ -75,9 +75,14 @@ tf_proto_library_cc(
 )
 
 tf_generate_proto_text_sources(
-    name = "test_proto_text_srcs",
+    name = "test_proto_text",
     srcs = ["test.proto"],
     srcs_relative_dir = "tensorflow/tools/proto_text/",
+    deps = [
+        ":test_proto_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
 )
 
 tf_cc_test(
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions.cc b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
index f0bb59acf8..234afe879b 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
@@ -130,7 +130,11 @@ int MainImpl(int argc, char** argv) {
 
       const string path = output_root + "/" + proto_path_no_suffix + suffix;
       FILE* f = fopen(path.c_str(), "w");
-      if (f == nullptr) return -1;
+      if (f == nullptr) {
+        // We don't expect this output to be generated. It was specified in the
+        // list of sources solely to satisfy a proto import dependency.
+        continue;
+      }
       if (fwrite(data.c_str(), 1, data.size(), f) != data.size()) {
         fclose(f);
         return -1;
-- 
GitLab


From b7f957ceedb6f47e4d68c506389bff210c35ef6a Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Tue, 24 Apr 2018 09:15:07 -0700
Subject: [PATCH 357/450] Add S64 clamp test.

PiperOrigin-RevId: 194096814
---
 .../compiler/xla/tests/vector_ops_simple_test.cc      | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
index 697d78fe6e..8b86b5e760 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@@ -348,6 +348,17 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) {
   ComputeAndCompareR1<float>(&builder, expected, {});
 }
 
+XLA_TEST_F(VecOpsSimpleTest, ClampValuesConstantS64) {
+  ComputationBuilder builder(client_, TestName());
+  auto zero = builder.ConstantR0<int64>(0);
+  auto one = builder.ConstantR0<int64>(10);
+  auto x = builder.ConstantR1<int64>({-3, 3, 9, 13});
+  auto clamp = builder.Clamp(zero, x, one);
+
+  std::vector<int64> expected = {0, 3, 9, 10};
+  ComputeAndCompareR1<int64>(&builder, expected, {});
+}
+
 XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
   Computation add_half;
   {
-- 
GitLab


From cfedd67f5881ae3697638e9b74eccb7da9818a0e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 09:44:52 -0700
Subject: [PATCH 358/450] Add an attr to apply_adagrad op that allows it to
 skip updating the accumulators.

PiperOrigin-RevId: 194100678
---
 tensorflow/core/kernels/training_ops.cc       | 23 ++++++++++++++-----
 tensorflow/core/kernels/training_ops.h        |  2 +-
 .../core/kernels/training_ops_gpu.cu.cc       |  6 +++--
 tensorflow/core/ops/training_ops.cc           |  4 ++++
 4 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 5b13b10937..271329599f 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -153,8 +153,10 @@ struct ApplyAdagrad<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
-                  typename TTypes<T>::ConstFlat grad) {
-    accum.device(d) += grad.square();
+                  typename TTypes<T>::ConstFlat grad, bool update_slots) {
+    if (update_slots) {
+      accum.device(d) += grad.square();
+    }
     var.device(d) -= grad * lr() * accum.rsqrt();
   }
 };
@@ -1074,6 +1076,7 @@ class ApplyAdagradOp : public OpKernel {
  public:
   explicit ApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("update_slots", &update_slots_));
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -1111,13 +1114,15 @@ class ApplyAdagradOp : public OpKernel {
 
     const Device& device = ctx->template eigen_device<Device>();
     functor::ApplyAdagrad<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
-                                       lr.scalar<T>(), grad.flat<T>());
+                                       lr.scalar<T>(), grad.flat<T>(),
+                                       update_slots_);
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
 
  private:
   bool use_exclusive_lock_;
+  bool update_slots_;
 };
 
 #define REGISTER_KERNELS(D, T)                                        \
@@ -1145,7 +1150,7 @@ namespace functor {
   void ApplyAdagrad<GPUDevice, T>::operator()(                            \
       const GPUDevice& d, typename TTypes<T>::Flat var,                   \
       typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
-      typename TTypes<T>::ConstFlat grad);                                \
+      typename TTypes<T>::ConstFlat grad, bool update_slots);             \
   extern template struct ApplyAdagrad<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -1266,6 +1271,7 @@ class SparseApplyAdagradOp : public OpKernel {
  public:
   explicit SparseApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("update_slots", &update_slots_));
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
@@ -1339,7 +1345,9 @@ class SparseApplyAdagradOp : public OpKernel {
           auto a = accum_flat.template chip<0>(index);
           auto g = grad_flat.template chip<0>(i);
           auto v = var_flat.template chip<0>(index);
-          a += g.square();
+          if (update_slots_) {
+            a += g.square();
+          }
           v -= g.constant(lr_scalar) * g * a.rsqrt();
         }
       } else {
@@ -1358,7 +1366,9 @@ class SparseApplyAdagradOp : public OpKernel {
                                           " in indices is out of range")));
           T& a = accum_flat(index);
           const T& g = grad_flat(i);
-          a += g * g;
+          if (update_slots_) {
+            a += g * g;
+          }
           var_flat(index) -= lr_scalar * g / Eigen::numext::sqrt(a);
         }
       }
@@ -1369,6 +1379,7 @@ class SparseApplyAdagradOp : public OpKernel {
 
  private:
   bool use_exclusive_lock_;
+  bool update_slots_;
 };
 
 #define REGISTER_KERNELS(T, Tindices)                                \
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index f536a61eb0..495a94f1a1 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -68,7 +68,7 @@ struct ApplyAdagrad {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
-                  typename TTypes<T>::ConstFlat grad);
+                  typename TTypes<T>::ConstFlat grad, bool update_slots);
 };
 
 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 2aa17f2a0f..4bd32592db 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -42,8 +42,10 @@ struct ApplyAdagrad<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
-                  typename TTypes<T>::ConstFlat grad) {
-    accum.device(d) += grad.square();
+                  typename TTypes<T>::ConstFlat grad, bool update_slots) {
+    if (update_slots) {
+      accum.device(d) += grad.square();
+    }
     Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
     bcast[0] = grad.dimension(0);
     Eigen::Sizes<1> single;
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index dc7b588898..94ff092a85 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -253,6 +253,7 @@ REGISTER_OP("ApplyAdagrad")
     .Output("out: Ref(T)")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
+    .Attr("update_slots: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, false /* sparse */);
     });
@@ -264,6 +265,7 @@ REGISTER_OP("ResourceApplyAdagrad")
     .Input("grad: T")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
+    .Attr("update_slots: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, false /* sparse */);
     });
@@ -320,6 +322,7 @@ REGISTER_OP("SparseApplyAdagrad")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
+    .Attr("update_slots: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, true /* sparse */);
     });
@@ -333,6 +336,7 @@ REGISTER_OP("ResourceSparseApplyAdagrad")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
+    .Attr("update_slots: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, true /* sparse */);
     });
-- 
GitLab


From 9c7e819352581bf5a97509b1fa5dc71dffa26500 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 10:24:26 -0700
Subject: [PATCH 359/450] Enable all arithmetic optimizations by default.

PiperOrigin-RevId: 194106835
---
 .../core/grappler/optimizers/arithmetic_optimizer.h      | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index c0fe8839ca..344c8281eb 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -57,9 +57,9 @@ class ArithmeticOptimizer : public GraphOptimizer {
     // TODO(ezhulenev): flag do disable TrySimplifyAndReplaceUses in tests.
     // Remove when all optimizers will be migrated to separate stages.
     bool enable_try_simplify_and_replace = true;
-    bool combine_add_to_addn = false;
+    bool combine_add_to_addn = true;
     bool hoist_common_factor_out_of_aggregation = true;
-    bool minimize_broadcasts = false;
+    bool minimize_broadcasts = true;
     bool remove_identity_transpose = true;
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
@@ -70,11 +70,6 @@ class ArithmeticOptimizer : public GraphOptimizer {
     static ArithmeticOptimizerOptions Default(
         RewriterConfig::Toggle opt_level) {
       ArithmeticOptimizerOptions options;
-      // TODO(ezhulenev): enable by default after 1.8 release cut
-      if (opt_level == RewriterConfig::AGGRESSIVE) {
-        options.combine_add_to_addn = true;
-        options.minimize_broadcasts = true;
-      }
       return options;
     }
   };
-- 
GitLab


From 55a4a479df8e1fbc8aa726596e6d4591364b3585 Mon Sep 17 00:00:00 2001
From: Sherry Moore <sherrym@google.com>
Date: Tue, 24 Apr 2018 10:31:17 -0700
Subject: [PATCH 360/450] Added a call in
 CheckpointSaverHook.after_create_session to always save checkpoint before the
 first training step.

PiperOrigin-RevId: 194107958
---
 .../python/learn/estimators/estimator_test.py |  4 +-
 tensorflow/python/estimator/estimator_test.py |  4 +-
 .../training/basic_session_run_hooks.py       | 36 ++++++++++--------
 .../training/basic_session_run_hooks_test.py  | 38 ++++++++++++++++---
 4 files changed, 58 insertions(+), 24 deletions(-)

diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index d81a534b79..9e5aaf3118 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -715,7 +715,9 @@ class EstimatorTest(test.TestCase):
     ckpt = checkpoint_state_pb2.CheckpointState()
     text_format.Merge(checkpoint_file_content, ckpt)
     self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
-    self.assertAllEqual(['model.ckpt-1', 'model.ckpt-5'],
+    # TODO(b/78461127): Please modify tests to not directly rely on names of
+    # checkpoints.
+    self.assertAllEqual(['model.ckpt-0', 'model.ckpt-5'],
                         ckpt.all_model_checkpoint_paths)
 
   def test_train_save_copy_reload(self):
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index d453e19357..0fea86124c 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -679,8 +679,10 @@ class EstimatorTrainTest(test.TestCase):
     ckpt = checkpoint_state_pb2.CheckpointState()
     text_format.Merge(checkpoint_file_content, ckpt)
     self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
+    # TODO(b/78461127): Please modify tests to not directly rely on names of
+    # checkpoints.
     self.assertAllEqual(
-        ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
+        ['model.ckpt-0', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
 
   def test_train_save_copy_reload(self):
     tmpdir = tempfile.mkdtemp()
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 3651291bdf..47339e057f 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -434,23 +434,27 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     for l in self._listeners:
       l.begin()
 
-  def before_run(self, run_context):  # pylint: disable=unused-argument
-    if self._timer.last_triggered_step() is None:
-      # We do write graph and saver_def at the first call of before_run.
-      # We cannot do this in begin, since we let other hooks to change graph and
-      # add variables in begin. Graph is finalized after all begin calls.
-      training_util.write_graph(
-          ops.get_default_graph().as_graph_def(add_shapes=True),
-          self._checkpoint_dir,
-          "graph.pbtxt")
-      saver_def = self._get_saver().saver_def if self._get_saver() else None
-      graph = ops.get_default_graph()
-      meta_graph_def = meta_graph.create_meta_graph_def(
-          graph_def=graph.as_graph_def(add_shapes=True),
-          saver_def=saver_def)
-      self._summary_writer.add_graph(graph)
-      self._summary_writer.add_meta_graph(meta_graph_def)
+  def after_create_session(self, session, coord):
+    global_step = session.run(self._global_step_tensor)
+    # We write graph and saver_def here, in after_create_session.
+    # We cannot do this in begin, since we let other hooks change the graph and
+    # add variables in begin. The graph is finalized after all begin calls.
+    training_util.write_graph(
+        ops.get_default_graph().as_graph_def(add_shapes=True),
+        self._checkpoint_dir,
+        "graph.pbtxt")
+    saver_def = self._get_saver().saver_def if self._get_saver() else None
+    graph = ops.get_default_graph()
+    meta_graph_def = meta_graph.create_meta_graph_def(
+        graph_def=graph.as_graph_def(add_shapes=True),
+        saver_def=saver_def)
+    self._summary_writer.add_graph(graph)
+    self._summary_writer.add_meta_graph(meta_graph_def)
+    # The checkpoint saved here is the state at step "global_step".
+    self._save(session, global_step)
+    self._timer.update_last_triggered_step(global_step)
 
+  def before_run(self, run_context):  # pylint: disable=unused-argument
     return SessionRunArgs(self._global_step_tensor)
 
   def after_run(self, run_context, run_values):
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 25962f6bf7..31898562f8 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -466,8 +466,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 2,
-        'after_save': 2,
+        'before_save': 3,
+        'after_save': 3,
         'end': 1
     }, listener_counts)
 
@@ -490,8 +490,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 2,
-        'after_save': 2,
+        'before_save': 3,
+        'after_save': 3,
         'end': 1
     }, listener_counts)
 
@@ -523,8 +523,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 2,
-        'after_save': 2,
+        'before_save': 3,
+        'after_save': 3,
         'end': 1
     }, listener1_counts)
     self.assertEqual(listener1_counts, listener2_counts)
@@ -706,6 +706,7 @@ class CheckpointSaverHookTest(test.TestCase):
       with session_lib.Session() as sess:
         sess.run(self.scaffold.init_op)
         mon_sess = monitored_session._HookedSession(sess, [hook])
+        hook.after_create_session(sess, None)
         mon_sess.run(self.train_op)
       summary_writer.assert_summaries(
           test_case=self,
@@ -718,6 +719,31 @@ class CheckpointSaverHookTest(test.TestCase):
 
     fake_summary_writer.FakeSummaryWriter.uninstall()
 
+  def test_save_checkpoint_before_first_train_step(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir, save_steps=2, scaffold=self.scaffold)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        sess.run(self.scaffold.init_op)
+        hook.after_create_session(sess, None)
+        # Verifies that checkpoint is saved at step 0.
+        self.assertEqual(0,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+        # Verifies that no checkpoint is saved after one training step.
+        mon_sess.run(self.train_op)
+        self.assertEqual(0,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+        # Verifies that checkpoint is saved after save_steps.
+        mon_sess.run(self.train_op)
+        self.assertEqual(2,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
 
 class CheckpointSaverHookMultiStepTest(test.TestCase):
 
-- 
GitLab


From f6ae3d54b0700ba76b56ebe3c702440f39460d2e Mon Sep 17 00:00:00 2001
From: Guangda Lai <laigd@google.com>
Date: Tue, 24 Apr 2018 10:51:08 -0700
Subject: [PATCH 361/450] Split gpu_id library to a header library and an
 implementation, so when if_static is false and we're building shared objects
 that depend on gpu_id, the implementation won't get linked.

PiperOrigin-RevId: 194111330
---
 tensorflow/core/BUILD | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 843fd7b907..bda87c6aed 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2563,6 +2563,19 @@ tf_cuda_library(
 
 cc_library(
     name = "gpu_id",
+    hdrs = [
+        "common_runtime/gpu/gpu_id.h",
+        "common_runtime/gpu/gpu_id_manager.h",
+    ],
+    deps = [
+        ":lib",
+    ] + if_static([
+        ":gpu_id_impl",
+    ]),
+)
+
+cc_library(
+    name = "gpu_id_impl",
     srcs = ["common_runtime/gpu/gpu_id_manager.cc"],
     hdrs = [
         "common_runtime/gpu/gpu_id.h",
@@ -2612,7 +2625,7 @@ tf_cuda_library(
         ":core_cpu_lib",
         ":framework",
         ":framework_internal",
-        ":gpu_id",
+        ":gpu_id_impl",
         ":gpu_init_impl",
         ":gpu_lib",
         ":graph",
-- 
GitLab


From 09398096284995d8a93c124bdbd70d6e1a44fbc3 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 24 Apr 2018 10:59:10 -0700
Subject: [PATCH 362/450] Update README.md

---
 tensorflow/tools/docker/README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
index f46c56e11a..525f2995ce 100644
--- a/tensorflow/tools/docker/README.md
+++ b/tensorflow/tools/docker/README.md
@@ -16,12 +16,12 @@ quick links here:
 
 We currently maintain two Docker container images:
 
-* `gcr.io/tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only!
+* `tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only!
 
-* `gcr.io/tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies
+* `tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies
   and support for NVidia CUDA
 
-Note: We also publish the same containers into
+Note: We store all our containers on 
 [Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow/tags/).
 
 
@@ -29,12 +29,12 @@ Note: We also publish the same containers into
 
 Run non-GPU container using
 
-    $ docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow
+    $ docker run -it -p 8888:8888 tensorflow/tensorflow
 
 For GPU support install NVidia drivers (ideally latest) and
 [nvidia-docker](https://github.com/NVIDIA/nvidia-docker). Run using
 
-    $ nvidia-docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu
+    $ nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:latest-gpu
 
 
 Note: If you would have a problem running nvidia-docker you may try the old method
@@ -44,7 +44,7 @@ it there and try using nvidia-docker as described above.
     $ # The old, not recommended way to run docker with gpu support:
     $ export CUDA_SO=$(\ls /usr/lib/x86_64-linux-gnu/libcuda.* | xargs -I{} echo '-v {}:{}')
     $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES gcr.io/tensorflow/tensorflow:latest-gpu
+    $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES tensorflow/tensorflow:latest-gpu
 
 
 ## More containers
-- 
GitLab


From b7b7ec32b848d6f5a7cf432fb44ceed4c9587078 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 24 Apr 2018 10:57:00 -0700
Subject: [PATCH 363/450] Add note that setting LD_LIBRARY_PATH after having
 already kicked off a build requires a clean rebuild.

PiperOrigin-RevId: 194112367
---
 tensorflow/docs_src/install/install_sources.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index b186758653..71f066e4cb 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -393,9 +393,9 @@ If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Star
 If the system outputs an error message instead of a greeting, see [Common
 installation problems](#common_installation_problems).
 
-## Common installation problems
+## Common build and installation problems
 
-The installation problems you encounter typically depend on the
+The build and installation problems you encounter typically depend on the
 operating system.  See the "Common installation problems" section
 of one of the following guides:
 
@@ -448,6 +448,11 @@ Stack Overflow and specify the `tensorflow` tag.
   </td>
 </tr>
 
+<tr>
+  <td><a href="https://stackoverflow.com/q/47080760">47080760</a></td>
+  <td><pre>undefined reference to `cublasGemmEx@libcublas.so.9.0'</pre></td>
+</tr>
+
 </table>
 
 ## Tested source configurations
-- 
GitLab


From 052c53c27956251e4b4952cd862596a9c08584e4 Mon Sep 17 00:00:00 2001
From: Billy Lamberta <blamb@google.com>
Date: Tue, 24 Apr 2018 11:09:09 -0700
Subject: [PATCH 364/450] Review fixes to install_linux

---
 tensorflow/docs_src/install/install_linux.md | 119 +++++++++++++------
 1 file changed, 84 insertions(+), 35 deletions(-)

diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 9b431e49ee..fa82ac9c40 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -116,23 +116,47 @@ There are a few options to install TensorFlow on your machine:
 <a name="InstallingVirtualenv"></a>
 ### Use `pip` in a virtual environment
 
-This is the *recommended* install method. The
-[Virtualenv](https://virtualenv.pypa.io/en/stable/) tool creates virtual Python
-environments that are isolated from other Python development on the same machine.
-In this scenario, you install TensorFlow and its dependencies within a virtual
-environment that is available when *activated*. Virtualenv provides a reliable
-way to install and run TensorFlow while avoiding conflicts with the rest of the
-system.
+Key Point: Using a virtual environment is the recommended install method.
 
-1\. On Ubuntu, install the `pip` and `virtualenv` packages:
+The [Virtualenv](https://virtualenv.pypa.io/en/stable/) tool creates virtual
+Python environments that are isolated from other Python development on the same
+machine. In this scenario, you install TensorFlow and its dependencies within a
+virtual environment that is available when *activated*. Virtualenv provides a
+reliable way to install and run TensorFlow while avoiding conflicts with the rest
+of the system.
+
+##### 1. Install Python, `pip`, and `virtualenv`.
+
+On Ubuntu, Python is automatically installed and `pip` is *usually* installed.
+Confirm the `python` and `pip` versions:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">python -V</code>
+  <code class="devsite-terminal">pip -V  # or: pip3 -V</code>
+</pre>
+
+To install these packages on Ubuntu:
 
 <pre class="prettyprint lang-bsh">
   <code class="devsite-terminal">sudo apt-get install python-pip python-dev python-virtualenv   # for Python 2.7</code>
   <code class="devsite-terminal">sudo apt-get install python3-pip python3-dev python-virtualenv # for Python 3.n</code>
 </pre>
 
-2\. Create a directory for the virtual environment and choose a Python
-interpreter:
+We *recommend* using `pip` version 8.1 or higher. If using a release before
+version 8.1, upgrade `pip`:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo pip install -U pip</code>
+</pre>
+
+If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is
+installed, use `easy_install` to install `pip`:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">easy_install -U pip</code>
+</pre>
+
+##### 2. Create a directory for the virtual environment and choose a Python interpreter.
 
 <pre class="prettyprint lang-bsh">
   <code class="devsite-terminal">mkdir ~/tensorflow  # somewhere to work out of</code>
@@ -142,7 +166,9 @@ interpreter:
   <code class="devsite-terminal">virtualenv --system-site-packages -p python3 <var>venv</var> # Use Python 3.n</code>
 </pre>
 
-3\. Activate the Virtualenv environment using one of these shell commands:
+##### 3. Activate the Virtualenv environment.
+
+Use one of these shell-specific commands to activate the virtual environment:
 
 <pre class="prettyprint lang-bsh">
   <code class="devsite-terminal">source ~/tensorflow/<var>venv</var>/bin/activate      # bash, sh, ksh, or zsh</code>
@@ -152,26 +178,32 @@ interpreter:
 
 When the Virtualenv is activated, the shell prompt displays as `(venv) $`.
 
-4\. Upgrade `pip` in your virtual environment:
+##### 4. Upgrade `pip` in the virtual environment.
 
-See the [pip installation guide](https://pip.pypa.io/en/stable/installing/) for
-instructions, or use `easy_install`:
+Within the active virtual environment, upgrade `pip`:
 
 <pre class="prettyprint lang-bsh">
-(venv)$ easy_install -U pip
+(venv)$ pip install -U pip
 </pre>
 
-5\. Within an active Virtualenv environment, use one of the following `pip`
-commands to install the TensorFlow package:
+You can install other Python packages within the virtual environment without
+affecting packages outside the `virtualenv`.
+
+##### 5. Install TensorFlow in the virtual environment.
+
+Choose one of the available TensorFlow packages for installation:
+
+* `tensorflow` — Current release for CPU
+* `tensorflow-gpu` — Current release with GPU support
+* `tf-nightly` — Nightly build for CPU
+* `tf-nightly-gpu` — Nightly build with GPU support
+
+Within an active Virtualenv environment, use `pip` to install the package:
 
 <pre class="prettyprint lang-bsh">
-(venv)$ pip install --upgrade tensorflow      # for Python 2.7</code>
-(venv)$ pip3 install --upgrade tensorflow     # for Python 3.n</code>
-(venv)$ pip install --upgrade tensorflow-gpu  # for Python 2.7 and GPU</code>
+  <code class="devsite-terminal">pip install -U tensorflow</code>
 </pre>
 
-Success! TensorFlow is now installed.
-
 Use `pip list` to show the packages installed in the virtual environment.
 [Validate the install](#ValidateYourInstallation) and test the version:
 
@@ -179,6 +211,8 @@ Use `pip list` to show the packages installed in the virtual environment.
 (venv)$ python -c "import tensorflow as tf; print(tf.__version__)"
 </pre>
 
+Success: TensorFlow is now installed.
+
 Use the `deactivate` command to stop the Python virtual environment.
 
 #### Problems
@@ -222,10 +256,9 @@ environment, a system `pip` install is straightforward.
 
 See the
 [REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py)
-for a list of TensorFlow packages that `pip` installs or upgrade`.
+for a list of packages that TensorFlow installs.
 
-
-#### Install Python and `pip`
+##### 1. Install Python and `pip`.
 
 On Ubuntu, Python is automatically installed and `pip` is *usually* installed.
 Confirm the `python` and `pip` versions:
@@ -235,28 +268,42 @@ Confirm the `python` and `pip` versions:
   <code class="devsite-terminal">pip -V  # or: pip3 -V</code>
 </pre>
 
-We *strongly recommend* `pip` or `pip3` version 8.1 or higher. If using a release
-before version 8.1,  upgrade `pip`:
+To install these packages on Ubuntu:
 
 <pre class="prettyprint lang-bsh">
   <code class="devsite-terminal">sudo apt-get install python-pip python-dev   # for Python 2.7</code>
   <code class="devsite-terminal">sudo apt-get install python3-pip python3-dev # for Python 3.n</code>
 </pre>
 
+We *recommend* using `pip` version 8.1 or higher. If using a release before
+version 8.1, upgrade `pip`:
 
-#### Install TensorFlow
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo pip install -U pip</code>
+</pre>
 
-Install one of the available TensorFlow packages:
+If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is
+installed, use `easy_install` to install `pip`:
 
 <pre class="prettyprint lang-bsh">
-  <code># Select one:</code>
-  <code class="devsite-terminal">sudo pip install tensorflow      # Python 2.7 CPU (no GPU support)</code>
-  <code class="devsite-terminal">sudo pip3 install tensorflow     # Python 3.n CPU (no GPU support)</code>
-  <code class="devsite-terminal">sudo pip install tensorflow-gpu  # Python 2.7 GPU support</code>
-  <code class="devsite-terminal">sudo pip3 install tensorflow-gpu # Python 3.n GPU support</code>
+  <code class="devsite-terminal">easy_install -U pip</code>
 </pre>
 
-Success! TensorFlow is now installed.
+##### 2. Install TensorFlow on the system.
+
+Choose one of the available TensorFlow packages for installation:
+
+* `tensorflow` — Current release for CPU
+* `tensorflow-gpu` — Current release with GPU support
+* `tf-nightly` — Nightly build for CPU
+* `tf-nightly-gpu` — Nightly build with GPU support
+
+And use `pip` to install the package for Python 2 or 3:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo pip install -U tensorflow   # Python 2.7</code>
+  <code class="devsite-terminal">sudo pip3 install -U tensorflow  # Python 3.n</code>
+</pre>
 
 Use `pip list` to show the packages installed on the system.
 [Validate the install](#ValidateYourInstallation) and test the version:
@@ -265,6 +312,8 @@ Use `pip list` to show the packages installed on the system.
   <code class="devsite-terminal">python -c "import tensorflow as tf; print(tf.__version__)"</code>
 </pre>
 
+Success: TensorFlow is now installed.
+
 #### Problems
 
 If the above steps failed, try installing the TensorFlow binary using the remote
-- 
GitLab


From aeaec69869f13fc37c3ed28881741dd344e6a150 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 11:18:47 -0700
Subject: [PATCH 365/450] Update ops-related pbtxt files.

PiperOrigin-RevId: 194116315
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 276 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |  28 ++
 2 files changed, 304 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 05dee30ca0..701897f162 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -2121,6 +2121,71 @@ op {
     }
   }
 }
+op {
+  name: "ApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "ApplyAdagradDA"
   input_arg {
@@ -43524,6 +43589,65 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyAdagradDA"
   input_arg {
@@ -47876,6 +48000,79 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceSparseApplyAdagradDA"
   input_arg {
@@ -58622,6 +58819,85 @@ op {
     }
   }
 }
+op {
+  name: "SparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "SparseApplyAdagradDA"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 2edd15c446..eb43c6fdfb 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -891,6 +891,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
   name: "ApplyAdagradDA"
@@ -21784,6 +21791,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   is_stateful: true
 }
 op {
@@ -23150,6 +23164,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   is_stateful: true
 }
 op {
@@ -27187,6 +27208,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
   name: "SparseApplyAdagradDA"
-- 
GitLab


From 4a82acf286df1bc10581d91e13e0ab17458e83b4 Mon Sep 17 00:00:00 2001
From: Raghuraman Krishnamoorthi <raghuramank@google.com>
Date: Tue, 24 Apr 2018 11:20:04 -0700
Subject: [PATCH 366/450]  Improve handling of scopes in folding unfused batch
 norms. This change allows folding to work for MobilenetV2 with unfused batch
 norms

PiperOrigin-RevId: 194116535
---
 .../quantize/python/fold_batch_norms.py       | 24 +++++-
 .../quantize/python/fold_batch_norms_test.py  | 79 +++++++++++++++++++
 2 files changed, 100 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index aa0ef64308..6f41722748 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -501,8 +501,27 @@ def _GetBatchNormParams(graph, context, has_scaling):
   bn_decay_var_tensor = None
 
   split_context = context.split('/')
-  base_context = split_context[-1]
-
+  # Matching variable names is brittle and relies on scoping
+  # conventions. Fused batch norm folding is more robust. Support for unfused
+  # batch norms will be deprecated as we move forward. Fused batch norms allow
+  # for faster training and should be used whenever possible.
+  # context contains part of the names of the tensors we are interested in:
+  # For MobilenetV1, the context has repetitions:
+  # MobilenetV1/MobilenetV1/Conv2d_3_depthwise
+  # when the moving_mean tensor has the name:
+  # MobilenetV1/Conv2d_3_depthwise/BatchNorm/moving_mean/read
+  # To pick the correct variable name, it is necessary to ignore the repeating
+  # header.
+
+  # For MobilenetV2, this problem does not exist:
+  # The context is: MobilenetV2/expanded_conv_3/depthwise
+  # and the names of the tensors start with a single MobilenetV2
+  # The moving mean for example, has the name:
+  # MobilenetV2/expanded_conv_3/depthwise/BatchNorm/moving_mean/read
+  # We ignore the first string (MobilenetV1 or MobilenetV2)
+  # in the context to match correctly in both cases
+
+  base_context = '/'.join(split_context[1:])
   oplist = graph.get_operations()
   op_suffix_mean = base_context + '/BatchNorm/moments/Squeeze'
   op_suffix_variance = base_context + '/BatchNorm/moments/Squeeze_1'
@@ -520,7 +539,6 @@ def _GetBatchNormParams(graph, context, has_scaling):
     op_suffix_gamma = base_context + '/BatchNorm/gamma'
     op_suffix_moving_variance = base_context + '/BatchNorm/moving_variance/read'
     op_suffix_moving_mean = base_context + '/BatchNorm/moving_mean/read'
-
   # Parse through list of ops to find relevant ops
   for op in oplist:
     if op.name.endswith(op_suffix_mean):
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
index af31467476..64e8142e7c 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
@@ -134,6 +134,85 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
   def testFoldConv2d(self):
     self._RunTestOverParameters(self._TestFoldConv2d)
 
+  def testMultipleLayerConv2d(self,
+                              relu=nn_ops.relu,
+                              relu_op_name='Relu',
+                              has_scaling=True,
+                              fused_batch_norm=False,
+                              freeze_batch_norm_delay=None):
+    """Tests folding cases for a network with multiple layers.
+
+    Args:
+      relu: Callable that returns an Operation, a factory method for the Relu*.
+      relu_op_name: String, name of the Relu* operation.
+      has_scaling: Bool, when true the batch norm has scaling.
+      fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
+    """
+    g = ops.Graph()
+    with g.as_default():
+      batch_size, height, width = 5, 128, 128
+      inputs = array_ops.zeros((batch_size, height, width, 3))
+      out_depth = 3
+      stride = 1
+      activation_fn = relu
+      scope = 'network/expanded_conv_1/conv'
+      layer1 = conv2d(
+          inputs,
+          out_depth, [5, 5],
+          stride=stride,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=activation_fn,
+          normalizer_fn=batch_norm,
+          normalizer_params=self._BatchNormParams(
+              scale=has_scaling, fused=fused_batch_norm),
+          scope=scope)
+      # Add another layer
+      scope = 'network/expanded_conv_2/conv'
+
+      _ = conv2d(
+          layer1,
+          2 * out_depth, [5, 5],
+          stride=stride,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=activation_fn,
+          normalizer_fn=batch_norm,
+          normalizer_params=self._BatchNormParams(
+              scale=has_scaling, fused=fused_batch_norm),
+          scope=scope)
+
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
+    folded_mul = g.get_operation_by_name(scope + '/mul_fold')
+    self.assertEqual(folded_mul.type, 'Mul')
+    self._AssertInputOpsAre(folded_mul, [
+        scope + '/correction_mult',
+        self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
+    ])
+    self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold'])
+
+    folded_conv = g.get_operation_by_name(scope + '/Conv2D_Fold')
+    self.assertEqual(folded_conv.type, 'Conv2D')
+    # Remove :0 at end of name for tensor prior to comparison
+    self._AssertInputOpsAre(folded_conv,
+                            [scope + '/mul_fold', layer1.name[:-2]])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
+
+    folded_add = g.get_operation_by_name(scope + '/add_fold')
+    self.assertEqual(folded_add.type, 'Add')
+    self._AssertInputOpsAre(folded_add, [
+        scope + '/correction_add',
+        self._BathNormBiasName(scope, fused_batch_norm)
+    ])
+    output_op_names = [scope + '/' + relu_op_name]
+    self._AssertOutputGoesToOps(folded_add, g, output_op_names)
+
+    for op in g.get_operations():
+      self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
+
   def _TestFoldConv2dUnknownShape(self, relu, relu_op_name, with_bypass,
                                   has_scaling, fused_batch_norm,
                                   freeze_batch_norm_delay):
-- 
GitLab


From 9d2972e6ceb4911458e867d75466e14a31fa1773 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 11:22:49 -0700
Subject: [PATCH 367/450]  show breakdown of execution cost with compute and
 memory cost for op summarization

PiperOrigin-RevId: 194117030
---
 .../core/grappler/costs/virtual_scheduler.cc     | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 0e5c654acf..7f68272950 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -850,14 +850,16 @@ Costs VirtualScheduler::Summary() const {
   VLOG(1) << "Expected max per-op streaming buffers: "
           << graph_costs_.max_per_op_streaming;
 
-  VLOG(1) << "Per-op execution time:";
+  VLOG(1) << "Per-op execution time / compute time / memory time:";
   for (const auto& op_cost_pair : op_to_cost_) {
     const auto& op = op_cost_pair.first;
     const auto& cost = op_cost_pair.second.execution_time.count();
+    const auto& compute_cost = op_cost_pair.second.compute_time.count();
+    const auto& memory_cost = op_cost_pair.second.memory_time.count();
     const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
     if (cost) {  // Skip printing out zero-cost ops.
       VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
-              << cost;
+              << cost << " / " << compute_cost << " / " << memory_cost;
     }
   }
 
@@ -898,7 +900,8 @@ Costs VirtualScheduler::Summary() const {
             << ", at the end: "
             << strings::HumanReadableNumBytes(state.memory_usage);
 
-    VLOG(1) << "Per-op execution time (and memory usage at peak memory usage):";
+    VLOG(1) << "Per-op execution time compute time / memory time "
+               "(and memory usage at peak memory usage):";
 
     // Profile non-persistent op memory usage.
     for (const auto& node_port : state.mem_usage_snapshot_at_peak) {
@@ -912,6 +915,8 @@ Costs VirtualScheduler::Summary() const {
     for (const auto& op_cost_pair : state.op_to_cost) {
       const auto& op = op_cost_pair.first;
       const auto& cost = op_cost_pair.second.execution_time.count();
+      const auto& compute_cost = op_cost_pair.second.compute_time.count();
+      const auto& memory_cost = op_cost_pair.second.memory_time.count();
       total_compute_time_ns += op_cost_pair.second.execution_time;
       const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
       if (!is_op_cost_accurate) {
@@ -930,8 +935,9 @@ Costs VirtualScheduler::Summary() const {
       if (cost || mem_usage_percent > 1.0) {
         // Print out only non-zero cost ops or ops with > 1% memory usage.
         VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
-                << cost << " (" << strings::HumanReadableNumBytes(op_mem_usage)
-                << " [" << mem_usage_percent << "%] "
+                << cost << " / " << compute_cost << " / " << memory_cost << " ("
+                << strings::HumanReadableNumBytes(op_mem_usage) << " ["
+                << mem_usage_percent << "%] "
                 << (persisent_ops.count(op) > 0 ? ": persistent op)" : ")");
       }
     }
-- 
GitLab


From d9cca05cbc5a4a7aeade2634e59fbf779965e3a0 Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Tue, 24 Apr 2018 11:24:37 -0700
Subject: [PATCH 368/450] Fix typo in event field name.

PiperOrigin-RevId: 194117352
---
 tensorflow/contrib/lite/profiling/profile_buffer.h     | 10 +++++-----
 .../contrib/lite/profiling/profile_buffer_test.cc      |  4 ++--
 tensorflow/contrib/lite/profiling/profiler_test.cc     |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/lite/profiling/profile_buffer.h b/tensorflow/contrib/lite/profiling/profile_buffer.h
index 3bfe02571b..b2f565376c 100644
--- a/tensorflow/contrib/lite/profiling/profile_buffer.h
+++ b/tensorflow/contrib/lite/profiling/profile_buffer.h
@@ -37,9 +37,9 @@ struct ProfileEvent {
   // Label of the event. This usually describes the event.
   const char* tag;
   // Timestamp in microseconds when the event began.
-  int64_t begin_timestamp_ms;
+  int64_t begin_timestamp_us;
   // Timestamp in microseconds when the event ended.
-  int64_t end_timestamp_ms;
+  int64_t end_timestamp_us;
   // The field containing the type of event. This must be one of the event types
   // in EventType.
   EventType event_type;
@@ -79,8 +79,8 @@ class ProfileBuffer {
     event_buffer_[index].tag = tag;
     event_buffer_[index].event_type = event_type;
     event_buffer_[index].event_metadata = event_metadata;
-    event_buffer_[index].begin_timestamp_ms = timestamp;
-    event_buffer_[index].end_timestamp_ms = 0;
+    event_buffer_[index].begin_timestamp_us = timestamp;
+    event_buffer_[index].end_timestamp_us = 0;
     current_index_++;
     return index;
   }
@@ -103,7 +103,7 @@ class ProfileBuffer {
     }
 
     int event_index = event_handle % max_size;
-    event_buffer_[event_index].end_timestamp_ms = NowMicros();
+    event_buffer_[event_index].end_timestamp_us = NowMicros();
   }
 
   // Returns the size of the buffer.
diff --git a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc
index 0c5f0cd314..b8784cca45 100644
--- a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc
+++ b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc
@@ -49,13 +49,13 @@ TEST(ProfileBufferTest, AddEvent) {
 
   auto event = GetProfileEvents(buffer)[0];
   EXPECT_EQ(event->tag, "hello");
-  EXPECT_GT(event->begin_timestamp_ms, 0);
+  EXPECT_GT(event->begin_timestamp_us, 0);
   EXPECT_EQ(event->event_type, ProfileEvent::EventType::DEFAULT);
   EXPECT_EQ(event->event_metadata, 42);
 
   buffer.EndEvent(event_handle);
   EXPECT_EQ(1, buffer.Size());
-  EXPECT_GE(event->end_timestamp_ms, event->begin_timestamp_ms);
+  EXPECT_GE(event->end_timestamp_us, event->begin_timestamp_us);
 }
 
 TEST(ProfileBufferTest, OverFlow) {
diff --git a/tensorflow/contrib/lite/profiling/profiler_test.cc b/tensorflow/contrib/lite/profiling/profiler_test.cc
index 994523a8fb..7914f36a31 100644
--- a/tensorflow/contrib/lite/profiling/profiler_test.cc
+++ b/tensorflow/contrib/lite/profiling/profiler_test.cc
@@ -30,7 +30,7 @@ namespace {
 void AssertDurationOfEventAroundMs(const ProfileEvent* event,
                                    double expected_ms, double eps_ms) {
   double duration_ms =
-      (event->end_timestamp_ms - event->begin_timestamp_ms) / 1e3;
+      (event->end_timestamp_us - event->begin_timestamp_us) / 1e3;
   EXPECT_NEAR(expected_ms, duration_ms, eps_ms);
 }
 
-- 
GitLab


From ff013946362e7d80c53b82b64a7f5b462808ff8f Mon Sep 17 00:00:00 2001
From: Malcolm Reynolds <mareynolds@google.com>
Date: Tue, 24 Apr 2018 11:26:26 -0700
Subject: [PATCH 369/450] Clarify error message when importing a GraphDef with
 unknown ops.

This should make the situation from github.com/tensorflow/tensorflow/issues/17014 less confusing.

PiperOrigin-RevId: 194117660
---
 tensorflow/python/framework/importer.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 3f8a8c4bef..5112bea48b 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -572,7 +572,14 @@ def import_graph_def(graph_def,
         if node.name in name_to_op:
           raise ValueError('Duplicate name \'%s\' in GraphDef.' % node.name)
         if node.op not in op_dict:
-          raise ValueError('No op named %s in defined operations.' % node.op)
+          raise ValueError(
+              'No op named %s in defined operations. If the Graph you are '
+              'importing uses custom ops or any parts of tf.contrib, you '
+              'should explicitly import the libraries defining those ops '
+              'before loading the Graph. Note that tf.contrib is lazily loaded '
+              'when accessed, so simply referencing (e.g.) '
+              '`tf.contrib.resampler` will cause those ops to be made '
+              'available.' % node.op)
         op_def = op_dict[node.op]
 
         output_types = _OutputTypes(node, op_dict)
-- 
GitLab


From de3e9830aae0904f0d40d37e9da5b113c4a9a0f0 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Tue, 24 Apr 2018 11:29:43 -0700
Subject: [PATCH 370/450] Small refactor of tf.keras aiming at centralizing
 reusable utilities in `utils`.

PiperOrigin-RevId: 194118244
---
 .../_impl/keras/applications/mobilenet.py     |   1 -
 .../keras/_impl/keras/engine/base_layer.py    | 133 ++----------------
 .../keras/_impl/keras/engine/network.py       |  39 ++---
 .../keras/_impl/keras/engine/topology_test.py |   8 +-
 .../keras/layers/advanced_activations.py      |  14 +-
 .../keras/_impl/keras/layers/convolutional.py |   4 +-
 .../keras/layers/convolutional_recurrent.py   |   6 +-
 .../keras/_impl/keras/layers/embeddings.py    |   6 +-
 .../python/keras/_impl/keras/layers/local.py  |  10 +-
 .../python/keras/_impl/keras/layers/merge.py  |  16 +--
 .../python/keras/_impl/keras/layers/noise.py  |   8 +-
 .../keras/_impl/keras/layers/recurrent.py     |  26 ++--
 .../keras/_impl/keras/layers/wrappers.py      |  18 +--
 .../keras/_impl/keras/utils/generic_utils.py  |  30 ++++
 .../keras/_impl/keras/utils/tf_utils.py       |  80 +++++++++++
 15 files changed, 199 insertions(+), 200 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index 12775fccec..7b7288793d 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -79,7 +79,6 @@ from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine import InputSpec
-from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
diff --git a/tensorflow/python/keras/_impl/keras/engine/base_layer.py b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
index abae6c3785..a3e78c95dc 100644
--- a/tensorflow/python/keras/_impl/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import collections
 import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
-import re
 
 import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
@@ -35,6 +34,10 @@ from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+# Modules that only depend on `keras.layers` import these from here.
+from tensorflow.python.keras._impl.keras.utils.generic_utils import to_snake_case  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.utils.tf_utils import is_tensor_or_tensor_list  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope as vs
@@ -177,7 +180,8 @@ class Layer(checkpointable.CheckpointableBase):
   def _init_set_name(self, name, zero_based=True):
     if not name:
       self._name = unique_layer_name(
-          to_snake_case(self.__class__.__name__), zero_based=zero_based)
+          generic_utils.to_snake_case(self.__class__.__name__),
+          zero_based=zero_based)
     else:
       self._name = name
 
@@ -318,7 +322,7 @@ class Layer(checkpointable.CheckpointableBase):
 
     # Requesting input-conditional updates.
     inputs = nest.flatten(inputs)
-    reachable = get_reachable_from_inputs(inputs, self.updates)
+    reachable = tf_utils.get_reachable_from_inputs(inputs, self.updates)
     updates = []
     for update in self.updates:
       if update in reachable:
@@ -419,7 +423,7 @@ class Layer(checkpointable.CheckpointableBase):
     # The losses we want to return will be part of this set.
     # To avoid unnecessary work, we stop the search in case all of
     # `self.losses` have been retrieved.
-    reachable = get_reachable_from_inputs(inputs, self.losses)
+    reachable = tf_utils.get_reachable_from_inputs(inputs, self.losses)
     losses = []
     for loss in self.losses:
       if loss in reachable:
@@ -639,7 +643,7 @@ class Layer(checkpointable.CheckpointableBase):
       if not hasattr(self, '_call_fn_args'):
         self._call_fn_args = estimator_util.fn_args(self.call)
       if ('mask' in self._call_fn_args and 'mask' not in kwargs and
-          not is_all_none(previous_mask)):
+          not generic_utils.is_all_none(previous_mask)):
         # The previous layer generated a mask, and mask was not explicitly pass
         # to __call__, hence we set previous_mask as the default value.
         kwargs['mask'] = previous_mask
@@ -1615,9 +1619,9 @@ class Node(object):
     # Following 2 properties: input and output shapes.
 
     # List of shape tuples, shapes of input_tensors.
-    self.input_shapes = [static_shape(x) for x in input_tensors]
+    self.input_shapes = [backend.int_shape(x) for x in input_tensors]
     # List of shape tuples, shapes of output_tensors.
-    self.output_shapes = [static_shape(x) for x in output_tensors]
+    self.output_shapes = [backend.int_shape(x) for x in output_tensors]
 
     # Optional keyword arguments to layer's `call`.
     self.arguments = arguments
@@ -1678,91 +1682,6 @@ class DeferredTensor(object):
                                                         self.dtype.name)
 
 
-def shape_type_conversion(fn):
-  """Decorator that handles tuple/TensorShape conversion.
-
-  Used in `compute_output_shape` and `build`.
-
-  Arguments:
-    fn: function to wrap.
-
-  Returns:
-    Wrapped function.
-  """
-
-  def wrapper(instance, input_shape):
-    if input_shape is not None:
-      if isinstance(input_shape, list):
-        input_shape = [
-            tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
-      else:
-        input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
-    output_shape = fn(instance, input_shape)
-    if output_shape is not None:
-      if isinstance(output_shape, list):
-        return [tensor_shape.TensorShape(x) for x in output_shape]
-      return tensor_shape.TensorShape(output_shape)
-
-  return wrapper
-
-
-def object_list_uid(object_list):
-  """Creates a single string from object ids."""
-  object_list = nest.flatten(object_list)
-  return ', '.join([str(abs(id(x))) for x in object_list])
-
-
-def static_shape(x):
-  """Get the static shape of a Tensor, or None if it is unavailable."""
-  if x is None:
-    return None
-  try:
-    return tuple(x.get_shape().as_list())
-  except ValueError:
-    return None
-
-
-def get_reachable_from_inputs(inputs, targets=None):
-  """Returns the set of tensors/ops reachable from `inputs`.
-
-  Stops if all targets have been found (target is optional).
-
-  Only valid in Symbolic mode, not Eager mode.
-
-  Args:
-    inputs: List of tensors.
-    targets: List of tensors.
-
-  Returns:
-    A set of tensors reachable from the inputs (includes the inputs themselves).
-  """
-  reachable = set(inputs)
-  if targets:
-    targets = set(targets)
-  queue = inputs[:]
-
-  while queue:
-    x = queue.pop()
-    if isinstance(x, ops.Operation):
-      outputs = x.outputs[:] or []
-      outputs += x._control_outputs
-    elif isinstance(x, ops.Tensor):
-      outputs = x.consumers()
-    elif isinstance(x, tf_variables.Variable):
-      outputs = [x.op]
-    else:
-      raise TypeError('Expected Operation, Variable, or Tensor, got ' + str(x))
-
-    for y in outputs:
-      if y not in reachable:
-        reachable.add(y)
-        queue.insert(0, y)
-
-    if targets and targets.issubset(reachable):
-      return reachable
-  return reachable
-
-
 def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
                       zero_based=False):
   """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
@@ -1809,28 +1728,6 @@ def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
   return proposed_name
 
 
-def to_snake_case(name):
-  intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name)
-  insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower()
-  # If the class is private the name starts with "_" which is not secure
-  # for creating scopes. We prefix the name with "private" in this case.
-  if insecure[0] != '_':
-    return insecure
-  return 'private' + insecure
-
-
-def is_all_none(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
-  # We cannot use Python's `any` because the iterable may return Tensors.
-  for element in iterable:
-    if element is not None:
-      return False
-  return True
-
-
 def have_all_keras_metadata(iterable_or_element):
   if not isinstance(iterable_or_element, (list, tuple)):
     iterable = [iterable_or_element]
@@ -1861,14 +1758,6 @@ def collect_previous_mask(input_tensors):
   return masks
 
 
-def is_tensor_or_tensor_list(v):
-  v = nest.flatten(v)
-  if v and isinstance(v[0], ops.Tensor):
-    return True
-  else:
-    return False
-
-
 def get_default_graph_uid_map():
   # TODO(fchollet): refactor this into backend.
   graph = ops.get_default_graph()
diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py
index 4127c781eb..9f8ee129aa 100644
--- a/tensorflow/python/keras/_impl/keras/engine/network.py
+++ b/tensorflow/python/keras/_impl/keras/engine/network.py
@@ -32,10 +32,11 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.keras._impl.keras.engine import saving
 from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary
 from tensorflow.python.platform import tf_logging as logging
@@ -252,8 +253,8 @@ class Network(base_layer.Layer):
     for x in self.inputs:
       mask = x._keras_mask if hasattr(x, '_keras_mask') else None  # pylint: disable=protected-access
       masks.append(mask)
-    mask_cache_key = (base_layer.object_list_uid(self.inputs) + '_' +
-                      base_layer.object_list_uid(masks))
+    mask_cache_key = (generic_utils.object_list_uid(self.inputs) + '_' +
+                      generic_utils.object_list_uid(masks))
     masks = []
     for x in self.outputs:
       mask = x._keras_mask if hasattr(x, '_keras_mask') else None  # pylint: disable=protected-access
@@ -274,7 +275,7 @@ class Network(base_layer.Layer):
       self.input_names.append(layer.name)
       if layer.is_placeholder:
         self._feed_input_names.append(layer.name)
-        self._feed_input_shapes.append(K.int_shape(self.inputs[i]))
+        self._feed_input_shapes.append(backend.int_shape(self.inputs[i]))
         # layer.input gives an error in eager mode
         if not context.executing_eagerly():
           self._feed_inputs.append(layer.input)
@@ -373,7 +374,7 @@ class Network(base_layer.Layer):
     weights = []
     for layer in self.layers:
       weights += layer.weights
-    return K.batch_get_value(weights)
+    return backend.batch_get_value(weights)
 
   def set_weights(self, weights):
     """Sets the weights of the model.
@@ -389,7 +390,7 @@ class Network(base_layer.Layer):
       for sw, w in zip(layer.weights, layer_weights):
         tuples.append((sw, w))
       weights = weights[num_param:]
-    K.batch_set_value(tuples)
+    backend.batch_set_value(tuples)
 
   def compute_mask(self, inputs, mask):
     if not self._is_graph_network:
@@ -400,8 +401,8 @@ class Network(base_layer.Layer):
       masks = [None for _ in range(len(inputs))]
     else:
       masks = generic_utils.to_list(mask)
-    cache_key = (base_layer.object_list_uid(inputs)
-                 + '_' + base_layer.object_list_uid(masks))
+    cache_key = (generic_utils.object_list_uid(inputs)
+                 + '_' + generic_utils.object_list_uid(masks))
     if cache_key in self._output_mask_cache:
       return self._output_mask_cache[cache_key]
     else:
@@ -515,7 +516,7 @@ class Network(base_layer.Layer):
         relevant_inputs += inputs
       else:
         relevant_inputs.append(inputs)
-    reachable = base_layer.get_reachable_from_inputs(relevant_inputs, updates)
+    reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, updates)
     relevant_conditional_updates = [x for x in updates if x in reachable]
     unconditional_updates = [
         x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
@@ -552,7 +553,7 @@ class Network(base_layer.Layer):
         relevant_inputs += inputs
       else:
         relevant_inputs.append(inputs)
-    reachable = base_layer.get_reachable_from_inputs(relevant_inputs, losses)
+    reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, losses)
     relevant_conditional_losses = [x for x in losses if x in reachable]
     unconditional_losses = [
         x for x in losses if x._unconditional_loss]  # pylint: disable=protected-access
@@ -634,8 +635,8 @@ class Network(base_layer.Layer):
     if not context.executing_eagerly():
       # Try to retrieve cached outputs if the layer has already been called
       # on these exact inputs.
-      cache_key = (base_layer.object_list_uid(inputs)
-                   + '_' + base_layer.object_list_uid(masks))
+      cache_key = (generic_utils.object_list_uid(inputs)
+                   + '_' + generic_utils.object_list_uid(masks))
       if cache_key in self._output_tensor_cache:
         # Cache hit.
         return self._output_tensor_cache[cache_key]
@@ -667,7 +668,7 @@ class Network(base_layer.Layer):
                        ': model has ' + str(len(self._input_layers)) +
                        ' tensor inputs.')
 
-    cache_key = base_layer.object_list_uid(input_shapes)
+    cache_key = generic_utils.object_list_uid(input_shapes)
     if cache_key not in self._output_shape_cache:
       # Cache miss. We have to run the network graph manually (recursive calls
       # to `compute_output_shape`).
@@ -856,7 +857,7 @@ class Network(base_layer.Layer):
     for x in self.outputs:
       assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x)
       tensor, mask = tensor_map[str(id(x))]
-      output_shapes.append(base_layer.static_shape(x))
+      output_shapes.append(backend.int_shape(x))
       output_tensors.append(tensor)
       output_masks.append(mask)
 
@@ -870,14 +871,14 @@ class Network(base_layer.Layer):
     if not context.executing_eagerly():
       # Update cache;
       # keys are based on ids on input tensors and inputs masks.
-      cache_key = (base_layer.object_list_uid(inputs)
-                   + '_' + base_layer.object_list_uid(masks))
+      cache_key = (generic_utils.object_list_uid(inputs)
+                   + '_' + generic_utils.object_list_uid(masks))
       self._output_tensor_cache[cache_key] = output_tensors
       self._output_mask_cache[cache_key] = output_masks
 
       if output_shapes is not None:
-        input_shapes = [base_layer.static_shape(x) for x in inputs]
-        cache_key = base_layer.object_list_uid(input_shapes)
+        input_shapes = [backend.int_shape(x) for x in inputs]
+        cache_key = generic_utils.object_list_uid(input_shapes)
         self._output_shape_cache[cache_key] = output_shapes
 
     return output_tensors, output_masks
@@ -1338,7 +1339,7 @@ class Network(base_layer.Layer):
         'class_name': self.__class__.__name__,
         'config': config,
         'keras_version': keras_version,
-        'backend': K.backend()
+        'backend': backend.backend()
     }
     return model_config
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
index 49cc1cd3b3..6993a04289 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
@@ -964,16 +964,16 @@ class GraphUtilsTest(test.TestCase):
       x_5 = x_3 * pl_1
 
       self.assertEqual(
-          keras.engine.base_layer.get_reachable_from_inputs([pl_1]),
+          keras.utils.tf_utils.get_reachable_from_inputs([pl_1]),
           {pl_1, x_1, x_4, x_5, x_1.op, x_4.op, x_5.op})
       self.assertEqual(
-          keras.engine.base_layer.get_reachable_from_inputs([pl_1, pl_2]),
+          keras.utils.tf_utils.get_reachable_from_inputs([pl_1, pl_2]),
           {pl_1, pl_2, x_1, x_2, x_4, x_5, x_1.op, x_2.op, x_4.op, x_5.op})
       self.assertEqual(
-          keras.engine.base_layer.get_reachable_from_inputs([pl_3]),
+          keras.utils.tf_utils.get_reachable_from_inputs([pl_3]),
           {pl_3, x_3, x_5, x_3.op, x_5.op})
       self.assertEqual(
-          keras.engine.base_layer.get_reachable_from_inputs([x_3]),
+          keras.utils.tf_utils.get_reachable_from_inputs([x_3]),
           {x_3, x_5, x_5.op})
 
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
index 11ca89d625..89931db3c0 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
@@ -25,7 +25,7 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -64,7 +64,7 @@ class LeakyReLU(Layer):
     base_config = super(LeakyReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -119,7 +119,7 @@ class PReLU(Layer):
     else:
       self.shared_axes = list(shared_axes)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     param_shape = list(input_shape[1:])
     self.param_broadcast = [False] * len(param_shape)
@@ -162,7 +162,7 @@ class PReLU(Layer):
     base_config = super(PReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -201,7 +201,7 @@ class ELU(Layer):
     base_config = super(ELU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -241,7 +241,7 @@ class ThresholdedReLU(Layer):
     base_config = super(ThresholdedReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -275,6 +275,6 @@ class Softmax(Layer):
     base_config = super(Softmax, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 12b965587f..9971f12773 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -28,7 +28,6 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 # imports for backwards namespace compatibility
 # pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling1D
@@ -39,6 +38,7 @@ from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling2D
 from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D
 # pylint: enable=unused-import
 from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
@@ -1731,7 +1731,7 @@ class DepthwiseConv2D(Conv2D):
 
     return outputs
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if self.data_format == 'channels_first':
       rows = input_shape[2]
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index 6b2a1d98fe..be25bbc043 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -28,11 +28,11 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.layers.recurrent import _generate_dropout_mask
 from tensorflow.python.keras._impl.keras.layers.recurrent import RNN
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -168,7 +168,7 @@ class ConvRNN2D(RNN):
     self.input_spec = [InputSpec(ndim=5)]
     self.states = None
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
@@ -209,7 +209,7 @@ class ConvRNN2D(RNN):
                          for _ in range(2)]
     return output_shape
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     # Note input_shape will be list of shapes of initial states and
     # constants if these are passed in __call__.
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 07b8726b85..2b353ac007 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -23,7 +23,7 @@ from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -114,7 +114,7 @@ class Embedding(Layer):
     self.mask_zero = mask_zero
     self.input_length = input_length
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     self.embeddings = self.add_weight(
         shape=(self.input_dim, self.output_dim),
@@ -130,7 +130,7 @@ class Embedding(Layer):
     else:
       return math_ops.not_equal(inputs, 0)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if self.input_length is None:
       return input_shape + (self.output_dim,)
diff --git a/tensorflow/python/keras/_impl/keras/layers/local.py b/tensorflow/python/keras/_impl/keras/layers/local.py
index 13d96e9392..caae820fb3 100644
--- a/tensorflow/python/keras/_impl/keras/layers/local.py
+++ b/tensorflow/python/keras/_impl/keras/layers/local.py
@@ -25,8 +25,8 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -120,7 +120,7 @@ class LocallyConnected1D(Layer):
     self.bias_constraint = constraints.get(bias_constraint)
     self.input_spec = InputSpec(ndim=3)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[2]
     if input_dim is None:
@@ -148,7 +148,7 @@ class LocallyConnected1D(Layer):
     self.input_spec = InputSpec(ndim=3, axes={2: input_dim})
     self.built = True
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     length = conv_utils.conv_output_length(input_shape[1], self.kernel_size[0],
                                            self.padding, self.strides[0])
@@ -307,7 +307,7 @@ class LocallyConnected2D(Layer):
     self.bias_constraint = constraints.get(bias_constraint)
     self.input_spec = InputSpec(ndim=4)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     if self.data_format == 'channels_last':
       input_row, input_col = input_shape[1:-1]
@@ -350,7 +350,7 @@ class LocallyConnected2D(Layer):
       self.input_spec = InputSpec(ndim=4, axes={-1: input_filter})
     self.built = True
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if self.data_format == 'channels_first':
       rows = input_shape[2]
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py
index 7c87e6c067..2b6cf7c8a9 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge.py
@@ -22,7 +22,7 @@ from __future__ import print_function
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
-from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
@@ -83,7 +83,7 @@ class _Merge(Layer):
         output_shape.append(i)
     return tuple(output_shape)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list):
@@ -181,7 +181,7 @@ class _Merge(Layer):
     else:
       return self._merge_function(inputs)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if input_shape[0] is None:
       output_shape = None
@@ -274,7 +274,7 @@ class Subtract(_Merge):
   ```
   """
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     super(Subtract, self).build(input_shape)
     if len(input_shape) != 2:
@@ -370,7 +370,7 @@ class Concatenate(_Merge):
     self.supports_masking = True
     self._reshape_required = False
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list) or len(input_shape) < 2:
@@ -392,7 +392,7 @@ class Concatenate(_Merge):
   def _merge_function(self, inputs):
     return K.concatenate(inputs, axis=self.axis)
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if not isinstance(input_shape, list):
       raise ValueError('A `Concatenate` layer should be called '
@@ -478,7 +478,7 @@ class Dot(_Merge):
     self.supports_masking = True
     self._reshape_required = False
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list) or len(input_shape) != 2:
@@ -523,7 +523,7 @@ class Dot(_Merge):
     output = K.batch_dot(x1, x2, axes)
     return output
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if not isinstance(input_shape, list) or len(input_shape) != 2:
       raise ValueError('A `Dot` layer should be called '
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/_impl/keras/layers/noise.py
index 72dc7a1ff8..addac5b137 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise.py
+++ b/tensorflow/python/keras/_impl/keras/layers/noise.py
@@ -22,7 +22,7 @@ import numpy as np
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -69,7 +69,7 @@ class GaussianNoise(Layer):
     base_config = super(GaussianNoise, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -116,7 +116,7 @@ class GaussianDropout(Layer):
     base_config = super(GaussianDropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -188,6 +188,6 @@ class AlphaDropout(Layer):
     base_config = super(AlphaDropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index f53db987ff..f6d6e1391c 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -31,8 +31,8 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
-from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -107,7 +107,7 @@ class StackedRNNCells(Layer):
     # Call the cells in order and store the returned states.
     new_nested_states = []
     for cell, states in zip(self.cells, nested_states):
-      if has_arg(cell.call, 'constants'):
+      if generic_utils.has_arg(cell.call, 'constants'):
         inputs, states = cell.call(inputs, states, constants=constants,
                                    **kwargs)
       else:
@@ -122,14 +122,14 @@ class StackedRNNCells(Layer):
       states += cell_states
     return inputs, states
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     if isinstance(input_shape, list):
       constants_shape = input_shape[1:]
       input_shape = input_shape[0]
     for cell in self.cells:
       if isinstance(cell, Layer):
-        if has_arg(cell.call, 'constants'):
+        if generic_utils.has_arg(cell.call, 'constants'):
           cell.build([input_shape] + constants_shape)
         else:
           cell.build(input_shape)
@@ -429,7 +429,7 @@ class RNN(Layer):
   def states(self, states):
     self._states = states
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
@@ -461,7 +461,7 @@ class RNN(Layer):
     else:
       return output_mask
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     # Note input_shape will be list of shapes of initial states and
     # constants if these are passed in __call__.
@@ -609,11 +609,11 @@ class RNN(Layer):
                        'or `batch_shape` argument to your Input layer.')
 
     kwargs = {}
-    if has_arg(self.cell.call, 'training'):
+    if generic_utils.has_arg(self.cell.call, 'training'):
       kwargs['training'] = training
 
     if constants:
-      if not has_arg(self.cell.call, 'constants'):
+      if not generic_utils.has_arg(self.cell.call, 'constants'):
         raise ValueError('RNN cell does not support constants')
 
       def step(inputs, states):
@@ -884,7 +884,7 @@ class SimpleRNNCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     self.kernel = self.add_weight(
         shape=(input_shape[-1], self.units),
@@ -1287,7 +1287,7 @@ class GRUCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[-1]
     self.kernel = self.add_weight(
@@ -1824,7 +1824,7 @@ class LSTMCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[-1]
     self.kernel = self.add_weight(
@@ -2388,7 +2388,7 @@ class Recurrent(Layer):
     self.dropout = 0
     self.recurrent_dropout = 0
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index 9aee5f03b6..34a8eeeb5b 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -23,11 +23,10 @@ import copy
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
-from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -183,7 +182,7 @@ class TimeDistributed(Wrapper):
 
   def call(self, inputs, training=None, mask=None):
     kwargs = {}
-    if has_arg(self.layer.call, 'training'):
+    if generic_utils.has_arg(self.layer.call, 'training'):
       kwargs['training'] = training
     uses_learning_phase = False  # pylint: disable=redefined-outer-name
 
@@ -213,7 +212,7 @@ class TimeDistributed(Wrapper):
         input_length = array_ops.shape(inputs)[1]
       # Shape: (num_samples * timesteps, ...). And track the
       # transformation in self._input_map.
-      input_uid = base_layer.object_list_uid(inputs)
+      input_uid = generic_utils.object_list_uid(inputs)
       inputs = array_ops.reshape(inputs, (-1,) + input_shape[2:])
       self._input_map[input_uid] = inputs
       # (num_samples * timesteps, ...)
@@ -305,7 +304,7 @@ class Bidirectional(Wrapper):
     self.forward_layer.set_weights(weights[:nw // 2])
     self.backward_layer.set_weights(weights[nw // 2:])
 
-  @shape_type_conversion
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     output_shape = tuple(self.forward_layer.compute_output_shape(
         input_shape).as_list())
@@ -383,12 +382,13 @@ class Bidirectional(Wrapper):
 
   def call(self, inputs, training=None, mask=None, initial_state=None):
     kwargs = {}
-    if has_arg(self.layer.call, 'training'):
+    if generic_utils.has_arg(self.layer.call, 'training'):
       kwargs['training'] = training
-    if has_arg(self.layer.call, 'mask'):
+    if generic_utils.has_arg(self.layer.call, 'mask'):
       kwargs['mask'] = mask
 
-    if initial_state is not None and has_arg(self.layer.call, 'initial_state'):
+    if initial_state is not None and generic_utils.has_arg(
+        self.layer.call, 'initial_state'):
       forward_state = initial_state[:len(initial_state) // 2]
       backward_state = initial_state[len(initial_state) // 2:]
       y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs)
diff --git a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
index 3bbe87f92d..db184d278c 100644
--- a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
@@ -21,6 +21,7 @@ import binascii
 import codecs
 import marshal
 import os
+import re
 import sys
 import time
 import types as python_types
@@ -28,6 +29,7 @@ import types as python_types
 import numpy as np
 import six
 
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
@@ -526,3 +528,31 @@ def to_list(x):
   if isinstance(x, list):
     return x
   return [x]
+
+
+def object_list_uid(object_list):
+  """Creates a single string from object ids."""
+  object_list = nest.flatten(object_list)
+  return ', '.join([str(abs(id(x))) for x in object_list])
+
+
+def to_snake_case(name):
+  intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name)
+  insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower()
+  # If the class is private the name starts with "_" which is not secure
+  # for creating scopes. We prefix the name with "private" in this case.
+  if insecure[0] != '_':
+    return insecure
+  return 'private' + insecure
+
+
+def is_all_none(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = iterable_or_element
+  # We cannot use Python's `any` because the iterable may return Tensors.
+  for element in iterable:
+    if element is not None:
+      return False
+  return True
diff --git a/tensorflow/python/keras/_impl/keras/utils/tf_utils.py b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py
index 8da5f77777..162e5b2cd6 100644
--- a/tensorflow/python/keras/_impl/keras/utils/tf_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py
@@ -17,9 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond as smart_module
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.util import nest
 
 
 def smart_cond(pred, true_fn=None, false_fn=None, name=None):
@@ -72,3 +75,80 @@ def constant_value(pred):
   if isinstance(pred, variables.Variable):
     return None
   return smart_module.smart_constant_value(pred)
+
+
+def is_tensor_or_tensor_list(v):
+  v = nest.flatten(v)
+  if v and isinstance(v[0], ops.Tensor):
+    return True
+  else:
+    return False
+
+
+def get_reachable_from_inputs(inputs, targets=None):
+  """Returns the set of tensors/ops reachable from `inputs`.
+
+  Stops if all targets have been found (target is optional).
+
+  Only valid in Symbolic mode, not Eager mode.
+
+  Args:
+    inputs: List of tensors.
+    targets: List of tensors.
+
+  Returns:
+    A set of tensors reachable from the inputs (includes the inputs themselves).
+  """
+  reachable = set(inputs)
+  if targets:
+    targets = set(targets)
+  queue = inputs[:]
+
+  while queue:
+    x = queue.pop()
+    if isinstance(x, ops.Operation):
+      outputs = x.outputs[:] or []
+      outputs += x._control_outputs  # pylint: disable=protected-access
+    elif isinstance(x, ops.Tensor):
+      outputs = x.consumers()
+    elif isinstance(x, variables.Variable):
+      outputs = [x.op]
+    else:
+      raise TypeError('Expected Operation, Variable, or Tensor, got ' + str(x))
+
+    for y in outputs:
+      if y not in reachable:
+        reachable.add(y)
+        queue.insert(0, y)
+
+    if targets and targets.issubset(reachable):
+      return reachable
+  return reachable
+
+
+def shape_type_conversion(fn):
+  """Decorator that handles tuple/TensorShape conversion.
+
+  Used in `compute_output_shape` and `build`.
+
+  Arguments:
+    fn: function to wrap.
+
+  Returns:
+    Wrapped function.
+  """
+
+  def wrapper(instance, input_shape):
+    if input_shape is not None:
+      if isinstance(input_shape, list):
+        input_shape = [
+            tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
+      else:
+        input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+    output_shape = fn(instance, input_shape)
+    if output_shape is not None:
+      if isinstance(output_shape, list):
+        return [tensor_shape.TensorShape(x) for x in output_shape]
+      return tensor_shape.TensorShape(output_shape)
+
+  return wrapper
-- 
GitLab


From c2b1eebe7e256dda88beb91c7fa7662e01d12f9b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 11:43:35 -0700
Subject: [PATCH 371/450] Updating tests in constant_folding_test.cc so that
 the tests evaluate the original and optimized graphs and check that the
 output is the same.

PiperOrigin-RevId: 194120424
---
 .../optimizers/constant_folding_test.cc       | 80 ++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 1acce05909..32dca29e12 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -520,6 +520,25 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
       EXPECT_EQ("Mul", node.op()) << node.name();
     }
   }
+
+  const std::vector<string> fetch = {"mul_0", "mul_4", "mul_8"};
+  auto x_known_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto x_partially_unknown_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
+  auto expected_tensors =
+      EvaluateNodes(item.graph, fetch,
+                    {{"x_known", x_known_t},
+                     {"x_partially_unknown", x_partially_unknown_t},
+                     {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(fetch.size(), expected_tensors.size());
+  auto tensors = EvaluateNodes(output, fetch,
+                               {{"x_known", x_known_t},
+                                {"x_partially_unknown", x_partially_unknown_t},
+                                {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(fetch.size(), tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorNear<float>(expected_tensors[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
@@ -572,6 +591,20 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
       EXPECT_TRUE(IsControlInput(node.input(1)));
     }
   }
+  const std::vector<string> fetch = {"addn1"};
+  auto x_partially_unknown_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto expected_tensors =
+      EvaluateNodes(item.graph, fetch,
+                    {{"x_partially_unknown", x_partially_unknown_t},
+                     {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(1, expected_tensors.size());
+  auto tensors = EvaluateNodes(output, fetch,
+                               {{"x_partially_unknown", x_partially_unknown_t},
+                                {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(expected_tensors[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, CreateConstNodes) {
@@ -1064,6 +1097,20 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
     }
   }
   EXPECT_EQ(9, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 6}));
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+  const std::vector<string> fetch_nodes = {"i1a", "i1b", "i2a", "i2b",
+                                           "i2c", "i3a", "i3b"};
+  auto tensors_expected = EvaluateNodes(
+      item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes,
+                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
@@ -1930,6 +1977,14 @@ TEST_F(ConstantFoldingTest, Packing) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  const std::vector<string> fetch_nodes = {"i1", "i2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes);
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes);
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+
   // Make sure that the representation of the folded constant is space
   // efficient: in particular, the whole message should be smaller than 8k
   // (the size needed to naively encode 1000 floats folded twice).
@@ -1965,6 +2020,13 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  std::vector<string> fetch_nodes = {"o1", "o2", "p1", "p2"};
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 5}));
+  auto g_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}, {"g", g_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
   status = optimizer.Optimize(nullptr, item, &output);
@@ -2005,6 +2067,11 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
     }
   }
   EXPECT_EQ(6, found);
+
+  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}, {"g", g_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
@@ -2024,6 +2091,11 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch_nodes = {"o1", "o2"};
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -2078,6 +2150,10 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
     }
   }
   EXPECT_EQ(7, found);
+  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
@@ -2539,6 +2615,8 @@ TEST_F(ConstantFoldingTest, TrivialPack) {
   EXPECT_EQ(tensors_expected[0].shape(), tensors[0].shape());
 }
 
+// The test does not evaluate the optimized and original graphs to check if
+// their outputs are the same. See b/78233179.
 TEST_F(ConstantFoldingTest, Enter) {
   GrapplerItem item;
   AttrValue frame_name;
@@ -2555,7 +2633,7 @@ TEST_F(ConstantFoldingTest, Enter) {
   value_tensor.AsProtoTensorContent(value.mutable_tensor());
 
   GraphDef& graph = item.graph;
-  AddNode("x", "Placeholder", {}, {{"T", type}}, &graph);
+  AddNode("x", "Placeholder", {}, {{"dtype", type}}, &graph);
   AddNode("c1", "Const", {"^x"}, {{"value", value}, {"dtype", type}}, &graph);
   AddNode("enter1", "Enter", {"x"},
           {{"T", type},
-- 
GitLab


From 9992042548ff268ac97ac3ebf1c584d380b0c106 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 11:46:17 -0700
Subject: [PATCH 372/450] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 194120868

---
 tensorflow/go/op/wrappers.go | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index d038846c4f..4d91f2b68e 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -9602,6 +9602,14 @@ func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
 	}
 }
 
+// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
 // Update '*var' according to the adagrad scheme.
 //
 // accum += grad * grad
@@ -10676,6 +10684,14 @@ func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagrad
 	}
 }
 
+// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
 // Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
 //
 // That is for rows we have grad for, we update var and accum as follows:
-- 
GitLab


From e6e43da77e9be2e7e455d94e9724983a263f310a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 11:49:35 -0700
Subject: [PATCH 373/450] Clarify error encountered when serializing
 critical_section_executions is a warning.

PiperOrigin-RevId: 194121508
---
 tensorflow/python/framework/meta_graph.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 391b17720c..923e76fc9c 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -439,9 +439,10 @@ def add_collection_def(meta_graph_def, key, graph=None,
       else:
         getattr(col_def, kind).value.extend([x for x in collection_list])
   except Exception as e:  # pylint: disable=broad-except
-    logging.warning("Error encountered when serializing %s.\n"
+    logging.warning("Issue encountered when serializing %s.\n"
                     "Type is unsupported, or the types of the items don't "
-                    "match field type in CollectionDef.\n%s", key, str(e))
+                    "match field type in CollectionDef. Note this is a warning "
+                    "and probably safe to ignore.\n%s", key, str(e))
     if key in meta_graph_def.collection_def:
       del meta_graph_def.collection_def[key]
     return
-- 
GitLab


From 7afe5df6b12309e20b471ce52a2549e6d6ea1745 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 12:45:45 -0700
Subject: [PATCH 374/450] Extract OptimizeGraph function in meta-optimizer.

PiperOrigin-RevId: 194129729
---
 .../optimizers/constant_folding_test.cc       |   1 -
 .../grappler/optimizers/meta_optimizer.cc     | 261 +++++++++---------
 .../core/grappler/optimizers/meta_optimizer.h |  32 ++-
 3 files changed, 167 insertions(+), 127 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 32dca29e12..25693c5c60 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -2528,7 +2528,6 @@ TEST_F(ConstantFoldingTest, PartialFolding_IdentityN) {
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
-  LOG(INFO) << output.DebugString();
   TF_EXPECT_OK(status);
   EXPECT_EQ(8, output.node_size());
   for (const auto& node : output.node()) {
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 335fb403f1..c98eef1a6a 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -36,6 +36,9 @@ namespace tensorflow {
 namespace grappler {
 
 namespace {
+
+constexpr int kDefaultNumberOfIterations = 1;
+
 int64 NumEdges(const GraphDef& graph) {
   int64 num_edges = 0;
   for (const auto& node : graph.node()) {
@@ -50,144 +53,144 @@ string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) {
                          NumEdges(after), " edges (",
                          NumEdges(after) - NumEdges(before), ")");
 }
+
+int NumIterations(const RewriterConfig& cfg) {
+  return cfg.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
+             ? kDefaultNumberOfIterations
+             : cfg.meta_optimizer_iterations();
+}
+
+// Check if optimizer is allowed to run only once.
+bool IsRunOnceOptimizer(const string& name) { return name == "layout"; }
+
 }  // namespace
 
-std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
-    const string& optimizer) {
-  std::unique_ptr<GraphOptimizer> graph_optimizer;
-  if (optimizer == "pruning") {
-    graph_optimizer.reset(new ModelPruner());
-  }
-  if (optimizer == "function") {
-    graph_optimizer.reset(new FunctionOptimizer(cfg_.function_optimization()));
+#define MK_OPT(NAME, VALUE) \
+  if (optimizer == NAME) return std::unique_ptr<GraphOptimizer>(VALUE)
+
+std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
+    const string& optimizer) const {
+  MK_OPT("pruning", new ModelPruner());
+  MK_OPT("function", new FunctionOptimizer(cfg_.function_optimization()));
+  MK_OPT("constfold", new ConstantFolding(cpu_device_));
+  MK_OPT("layout", new LayoutOptimizer());
+  MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL));
+  MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
+  MK_OPT("autoparallel", new AutoParallel(cfg_.auto_parallel().num_replicas()));
+  MK_OPT("loop", new LoopOptimizer(cfg_.loop_optimization()));
+  MK_OPT("dependency", new DependencyOptimizer(cfg_.dependency_optimization()));
+  MK_OPT("debug_stripper", new DebugStripper());
+
+  return std::unique_ptr<GraphOptimizer>();
+}
+
+#undef MK_OPT
+
+Status MetaOptimizer::InitializeOptimizers(
+    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  if (!cfg_.disable_model_pruning()) {
+    optimizers->emplace_back(new ModelPruner());
   }
-  if (optimizer == "constfold") {
-    graph_optimizer.reset(new ConstantFolding(cpu_device_));
+  if (cfg_.function_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new FunctionOptimizer(cfg_.function_optimization()));
   }
-  if (optimizer == "layout") {
-    graph_optimizer.reset(new LayoutOptimizer());
+  if (cfg_.debug_stripper() == RewriterConfig::ON) {
+    optimizers->emplace_back(new DebugStripper());
   }
-  if (optimizer == "memory") {
-    graph_optimizer.reset(new MemoryOptimizer(RewriterConfig::MANUAL));
+  if (cfg_.constant_folding() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new ConstantFolding(cfg_.constant_folding(), cpu_device_));
   }
-  if (optimizer == "arithmetic") {
-    graph_optimizer.reset(
+  if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
         new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
   }
-  if (optimizer == "autoparallel") {
-    graph_optimizer.reset(
-        new AutoParallel(cfg_.auto_parallel().num_replicas()));
-  }
-  if (optimizer == "loop") {
-    graph_optimizer.reset(new LoopOptimizer(cfg_.loop_optimization()));
+  if (cfg_.loop_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new LoopOptimizer(cfg_.loop_optimization()));
   }
-  if (optimizer == "dependency") {
-    graph_optimizer.reset(
+  if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
         new DependencyOptimizer(cfg_.dependency_optimization()));
   }
-  if (optimizer == "debug_stripper") {
-    graph_optimizer.reset(new DebugStripper());
+  if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new LayoutOptimizer());
+  }
+  if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
+    if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
+      optimizers->emplace_back(
+          // Use the default target node name prefix "gradients/"
+          new MemoryOptimizer(cfg_.memory_optimization()));
+    } else {
+      optimizers->emplace_back(
+          new MemoryOptimizer(cfg_.memory_optimization(),
+                              cfg_.memory_optimizer_target_node_name_scope()));
+    }
   }
-  return graph_optimizer;
+  if (cfg_.auto_parallel().enable()) {
+    optimizers->emplace_back(
+        new AutoParallel(cfg_.auto_parallel().num_replicas()));
+  }
+  return Status::OK();
 }
 
-Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* optimized_graph) {
-  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
-  if (cfg_.optimizers().empty()) {
-    if (!cfg_.disable_model_pruning()) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(new ModelPruner()));
-    }
-    if (cfg_.function_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new FunctionOptimizer(cfg_.function_optimization())));
-    }
-    if (cfg_.debug_stripper() == RewriterConfig::ON) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new DebugStripper()));
-    }
-    if (cfg_.constant_folding() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new ConstantFolding(cfg_.constant_folding(), cpu_device_)));
-    }
-    if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new ArithmeticOptimizer(cfg_.arithmetic_optimization())));
-    }
-    if (cfg_.loop_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new LoopOptimizer(cfg_.loop_optimization())));
-    }
-    if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new DependencyOptimizer(cfg_.dependency_optimization())));
-    }
-    if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new LayoutOptimizer()));
-    }
-    if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
-      if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
-        optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-            // Use the default target node name prefix "gradients/"
-            new MemoryOptimizer(cfg_.memory_optimization())));
-      } else {
-        optimizers.push_back(
-            std::unique_ptr<GraphOptimizer>(new MemoryOptimizer(
-                cfg_.memory_optimization(),
-                cfg_.memory_optimizer_target_node_name_scope())));
-      }
+Status MetaOptimizer::InitializeOptimizersByName(
+    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  for (const string& optimizer_name : cfg_.optimizers()) {
+    auto optimizer = MakeNewOptimizer(optimizer_name);
+    if (optimizer) {
+      VLOG(2) << "Registered default graph optimizer: " << optimizer_name;
+      optimizers->push_back(std::move(optimizer));
+      continue;
     }
-    if (cfg_.auto_parallel().enable()) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new AutoParallel(cfg_.auto_parallel().num_replicas())));
+
+    auto custom_optimizer =
+        CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
+
+    if (custom_optimizer) {
+      VLOG(2) << "Registered custom graph optimizer: " << optimizer_name;
+      TF_RETURN_IF_ERROR(custom_optimizer->Init());
+      optimizers->push_back(std::move(custom_optimizer));
+    } else {
+      VLOG(2) << "Can't register an optimizer by name: " << optimizer_name;
     }
+  }
+  return Status::OK();
+}
+
+Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
+                                    GraphDef* optimized_graph) {
+  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
+  if (cfg_.optimizers().empty()) {
+    TF_RETURN_IF_ERROR(InitializeOptimizers(&optimizers));
   } else {
-    const std::set<string> available_optimizers = {
-        "pruning",    "function",      "constfold",  "layout",
-        "memory",     "autoparallel",  "arithmetic", "loop",
-        "dependency", "debug_stripper"};
-    std::vector<string> custom_optimizer_names;
-    for (const auto& optimizer_name : cfg_.optimizers()) {
-      if (available_optimizers.find(optimizer_name) !=
-          available_optimizers.end()) {
-        optimizers.push_back(NewOptimizer(optimizer_name));
-      } else {
-        custom_optimizer_names.push_back(optimizer_name);
-      }
-    }
-    // Now run the custom optimizers.
-    for (const auto& optimizer_name : custom_optimizer_names) {
-      std::unique_ptr<CustomGraphOptimizer> opt =
-          CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
-      if (opt == nullptr) continue;
-      TF_RETURN_IF_ERROR(opt->Init());
-      optimizers.push_back(std::move(opt));
-    }
+    TF_RETURN_IF_ERROR(InitializeOptimizersByName(&optimizers));
   }
 
+  VLOG(2) << "Optimize GrapplerItem: item.id=" << item.id
+          << " num_optimizers=" << optimizers.size();
+
   if (optimizers.empty()) {
+    VLOG(3) << "Skip graph optimization, no optimizers registered";
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
-  // Some optimizers should be run only once.
-  const std::set<string> run_once_optimizers = {"layout"};
-  bool already_optimized = false;
-  const int num_iterations =
-      cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
-          ? 1
-          : cfg_.meta_optimizer_iterations();
+  // Invariant: optimized_graph contains the most recently optimized version of
+  // the graph.
   GrapplerItem optimized_item = item;
   optimized_graph->Swap(&optimized_item.graph);
-  for (int iteration = 0; iteration < num_iterations; ++iteration) {
-    VLOG(1) << "Starting optimization iteration " << iteration + 1;
+
+  bool is_optimized = false;
+  GraphOptimizationResult optimization_result(item.id);
+
+  for (int iteration = 0; iteration < NumIterations(cfg_); ++iteration) {
+    VLOG(4) << "Starting optimization iteration " << iteration + 1;
+
     for (const auto& optimizer : optimizers) {
-      // Invariant: optimized_graph contains the most recently optimized
-      // version of the graph.
-      if (iteration > 0 && run_once_optimizers.count(optimizer->name())) {
-        continue;
-      }
+      // Some optimizers can run only once.
+      if (iteration > 0 && IsRunOnceOptimizer(optimizer->name())) continue;
+
       uint64 start_us = Env::Default()->NowMicros();
       // This swaps the current optimized_graph into optimized item and
       // resets optimized_graph to an empty graph.
@@ -195,41 +198,53 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       *optimized_graph = GraphDef();
       Status status =
           optimizer->Optimize(cluster, optimized_item, optimized_graph);
-
       uint64 end_us = Env::Default()->NowMicros();
-      float duration_ms = (end_us - start_us) / 1000.0f;
+
       string result;
       if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
-                << status.ToString();
         optimized_graph->Swap(&optimized_item.graph);
         result = status.ToString();
       } else {
-        already_optimized = true;
+        is_optimized = true;
+        float duration_ms = (end_us - start_us) / 1000.0f;
         result = strings::StrCat(
-            optimizer->name(), ": ",
             PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph),
             ", time = ", duration_ms, "ms.");
       }
-      result_.emplace_back(optimizer->name(), result);
-      VLOG(1) << result;
+      VLOG(4) << optimizer->name() << ": " << result;
+
+      OptimizerResult optimizer_result{optimizer->name(), result};
+      optimization_result.results.push_back(optimizer_result);
     }
   }
 
-  if (already_optimized) {
+  // Record graph optimization result.
+  optimization_results_.push_back(optimization_result);
+
+  if (is_optimized) {
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
     ReassignColocation(optimized_graph);
     // Make sure that the optimizers preserved the graph version.
     DCHECK_EQ(optimized_graph->versions().producer(),
               item.graph.versions().producer());
   }
+
+  return Status::OK();
+}
+
+Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* optimized_graph) {
+  optimization_results_.clear();
+  TF_RETURN_IF_ERROR(OptimizeGraph(cluster, item, optimized_graph));
   return Status::OK();
 }
 
 void MetaOptimizer::PrintResult() {
-  for (const auto& result : result_) {
-    LOG(INFO) << "Return status of optimizer " << result.first << ": "
-              << result.second;
+  for (const GraphOptimizationResult& graph_result : optimization_results_) {
+    LOG(INFO) << "Optimization results for grappler item: " << graph_result.id;
+    for (const OptimizerResult& result : graph_result.results) {
+      LOG(INFO) << "  " << result.optimizer_name << ": " << result.result;
+    }
   }
 }
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index 382cfe51d4..b8d4666248 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -30,7 +30,7 @@ class MetaOptimizer : public GraphOptimizer {
  public:
   MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg)
       : cpu_device_(cpu_device), cfg_(cfg) {}
-  ~MetaOptimizer() override {}
+  ~MetaOptimizer() override = default;
 
   string name() const override { return "meta_optimizer"; };
 
@@ -43,10 +43,36 @@ class MetaOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
-  std::unique_ptr<GraphOptimizer> NewOptimizer(const string& optimizer);
+  std::unique_ptr<GraphOptimizer> MakeNewOptimizer(
+      const string& optimizer) const;
+
+  // Initialize active optimizers from RewriterConfig toggles.
+  Status InitializeOptimizers(
+      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
+  // Initialize active optimizers from RewriterConfig optimizer names.
+  Status InitializeOptimizersByName(
+      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
+
+  // Runs an optimization pass over a single GrapplerItem. The meta optimizer
+  // may run several such passes: (1) the main graph, (2) the function library.
+  Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
+                       GraphDef* optimized_graph);
+
   DeviceBase* const cpu_device_;  // may be NULL
   RewriterConfig cfg_;
-  std::vector<std::pair<string, string>> result_;
+
+  struct OptimizerResult {
+    string optimizer_name;
+    string result;
+  };
+
+  struct GraphOptimizationResult {
+    explicit GraphOptimizationResult(const string& id) : id(id) {}
+    string id;
+    std::vector<OptimizerResult> results;
+  };
+
+  std::vector<GraphOptimizationResult> optimization_results_;
 };
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg);
-- 
GitLab


From 33ffc8e7ff5090b92951c7faac150042dd814085 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 13:08:51 -0700
Subject: [PATCH 375/450] embedding_lookup_sparse documentation change. Remove
 "(typically from FeatureValueToId)" from args descriptions. This appears to
 have been an obsolete reference from an ancestor implementation.

PiperOrigin-RevId: 194133212
---
 tensorflow/python/ops/embedding_ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 9e46739bc1..6f2a34c731 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -331,8 +331,8 @@ def embedding_lookup_sparse(params,
       representing sharded embedding tensors.  Alternatively, a
       `PartitionedVariable`, created by partitioning along dimension 0. Each
       element must be appropriately sized for the given `partition_strategy`.
-    sp_ids: N x M `SparseTensor` of int64 ids (typically from FeatureValueToId),
-      where N is typically batch size and M is arbitrary.
+    sp_ids: N x M `SparseTensor` of int64 ids where N is typically batch size
+      and M is arbitrary.
     sp_weights: either a `SparseTensor` of float / double weights, or `None` to
       indicate all weights should be taken to be 1. If specified, `sp_weights`
       must have exactly the same shape and indices as `sp_ids`.
-- 
GitLab


From 893aa776009418c841d49c924207f3cdaf1d5174 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Tue, 24 Apr 2018 13:13:18 -0700
Subject: [PATCH 376/450] Fixing concurrency issues in RPC factory.

PiperOrigin-RevId: 194133903
---
 .../contrib/rpc/python/kernel_tests/BUILD     |   1 -
 .../rpc/python/kernel_tests/rpc_op_test.py    |   1 +
 .../python/kernel_tests/rpc_op_test_base.py   |  60 ++++---
 .../rpc/grpc_rpc_factory.cc                   | 135 +++++++-------
 .../rpc/grpc_rpc_factory.h                    |  18 ++
 tensorflow/core/util/rpc/call_container.h     | 165 +++++++++++++-----
 tensorflow/core/util/rpc/rpc_factory.h        |   5 +-
 7 files changed, 251 insertions(+), 134 deletions(-)

diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
index f3e6731213..2311c15a68 100644
--- a/tensorflow/contrib/rpc/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
@@ -28,7 +28,6 @@ py_library(
 py_library(
     name = "rpc_op_test_base",
     srcs = ["rpc_op_test_base.py"],
-    tags = ["notsan"],
     deps = [
         ":test_example_proto_py",
         "//tensorflow/contrib/proto",
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
index e2e0dbc7a2..3fc6bfbb4d 100644
--- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
@@ -35,6 +35,7 @@ class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase):
   _protocol = 'grpc'
 
   invalid_method_string = 'Method not found'
+  connect_failed_string = 'Connect Failed'
 
   def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
     super(RpcOpTest, self).__init__(methodName)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
index 89f3ee1a1c..27273d16b1 100644
--- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
@@ -93,40 +93,39 @@ class RpcOpTestBase(object):
       response_values = sess.run(response_tensors)
     self.assertAllEqual(response_values.shape, [0])
 
-  def testInvalidAddresses(self):
-    with self.test_session() as sess:
-      with self.assertRaisesOpError(self.invalid_method_string):
-        sess.run(
-            self.rpc(
-                method='/InvalidService.IncrementTestShapes',
-                address=self._address,
-                request=''))
+  def testInvalidMethod(self):
+    for method in [
+        '/InvalidService.IncrementTestShapes',
+        self.get_method_name('InvalidMethodName')
+    ]:
+      with self.test_session() as sess:
+        with self.assertRaisesOpError(self.invalid_method_string):
+          sess.run(self.rpc(method=method, address=self._address, request=''))
 
-      with self.assertRaisesOpError(self.invalid_method_string):
-        sess.run(
-            self.rpc(
-                method=self.get_method_name('InvalidMethodName'),
-                address=self._address,
-                request=''))
+        _, status_code_value, status_message_value = sess.run(
+            self.try_rpc(method=method, address=self._address, request=''))
+        self.assertEqual(errors.UNIMPLEMENTED, status_code_value)
+        self.assertTrue(
+            self.invalid_method_string in status_message_value.decode('ascii'))
 
-      # This also covers the case of address=''
-      # and address='localhost:293874293874'
+  def testInvalidAddress(self):
+    # This covers the case of address='' and address='localhost:293874293874'
+    address = 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+    with self.test_session() as sess:
       with self.assertRaises(errors.UnavailableError):
         sess.run(
             self.rpc(
                 method=self.get_method_name('IncrementTestShapes'),
-                address='unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@',
+                address=address,
                 request=''))
-
-      # Test invalid method with the TryRpc op
       _, status_code_value, status_message_value = sess.run(
           self.try_rpc(
-              method=self.get_method_name('InvalidMethodName'),
-              address=self._address,
+              method=self.get_method_name('IncrementTestShapes'),
+              address=address,
               request=''))
-      self.assertEqual(errors.UNIMPLEMENTED, status_code_value)
+      self.assertEqual(errors.UNAVAILABLE, status_code_value)
       self.assertTrue(
-          self.invalid_method_string in status_message_value.decode('ascii'))
+          self.connect_failed_string in status_message_value.decode('ascii'))
 
   def testAlwaysFailingMethod(self):
     with self.test_session() as sess:
@@ -138,6 +137,18 @@ class RpcOpTestBase(object):
       with self.assertRaisesOpError(I_WARNED_YOU):
         sess.run(response_tensors)
 
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('AlwaysFailWithInvalidArgument'),
+          address=self._address,
+          request='')
+      self.assertEqual(response_tensors.shape, ())
+      self.assertEqual(status_code.shape, ())
+      self.assertEqual(status_message.shape, ())
+      status_code_value, status_message_value = sess.run((status_code,
+                                                          status_message))
+      self.assertEqual(errors.INVALID_ARGUMENT, status_code_value)
+      self.assertTrue(I_WARNED_YOU in status_message_value.decode('ascii'))
+
   def testSometimesFailingMethodWithManyRequests(self):
     with self.test_session() as sess:
       # Fail hard by default.
@@ -197,8 +208,7 @@ class RpcOpTestBase(object):
               address=self._address,
               request=request_tensors) for _ in range(10)
       ]
-      # Launch parallel 10 calls to the RpcOp, each containing
-      # 20 rpc requests.
+      # Launch parallel 10 calls to the RpcOp, each containing 20 rpc requests.
       many_response_values = sess.run(many_response_tensors)
     self.assertEqual(10, len(many_response_values))
     for response_values in many_response_values:
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
index d004abd1c1..cde6b785dc 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -30,7 +30,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace {
+namespace internal {
 class GrpcCall {
  public:
   explicit GrpcCall(CallContainer<GrpcCall>* container, int index, bool try_rpc,
@@ -57,9 +57,10 @@ class GrpcCall {
     container_->Done(s, index_);
   }
 
+  CallOptions* call_opts() { return &call_opts_; }
+  int index() { return index_; }
   const string& request() const { return *request_msg_; }
   string* response() const { return response_msg_; }
-  CallOptions* call_opts() { return &call_opts_; }
 
  private:
   CallContainer<GrpcCall>* const container_;
@@ -72,7 +73,9 @@ class GrpcCall {
   string* status_message_;
 };
 
-}  // namespace
+}  // namespace internal
+
+using internal::GrpcCall;
 
 GrpcRPCFactory::GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast,
                                int64 timeout_in_ms)
@@ -110,28 +113,6 @@ void GrpcRPCFactory::Call(OpKernelContext* ctx, int64 num_elements,
                           Tensor* response_t, Tensor* status_code_t,
                           Tensor* status_message_t,
                           AsyncOpKernel::DoneCallback done) {
-  auto address = address_t.flat<string>();
-  auto method = method_t.flat<string>();
-  auto request = request_t.flat<string>();
-
-  // Stubs are maintained by the GrpcRPCFactory class and will be
-  // deleted when the class is destroyed.
-  ::grpc::GenericStub* singleton_stub = nullptr;
-  if (address.size() == 1) {
-    singleton_stub = GetOrCreateStubForAddress(address(0));
-  }
-  auto get_stub = [&address, this,
-                   singleton_stub](int64 ix) -> ::grpc::GenericStub* {
-    return (address.size() > 1) ? GetOrCreateStubForAddress(address(ix))
-                                : singleton_stub;
-  };
-  auto get_method_ptr = [&method](int64 ix) -> const string* {
-    return (method.size() > 1) ? &(method(ix)) : &(method(0));
-  };
-  auto get_request_ptr = [&request](int64 ix) -> const string* {
-    return (request.size() > 1) ? &(request(ix)) : &(request(0));
-  };
-
   if (try_rpc) {
     // In this case status_code will never be set in the response,
     // so we just set it to OK.
@@ -140,49 +121,22 @@ void GrpcRPCFactory::Call(OpKernelContext* ctx, int64 num_elements,
         static_cast<int>(errors::Code::OK));
   }
 
-  CancellationManager* cm = ctx->cancellation_manager();
-  CancellationToken cancellation_token = cm->get_cancellation_token();
-
-  // This object will delete itself when done.
-  auto* container =
-      new CallContainer<GrpcCall>(ctx, num_elements, fail_fast_, try_rpc,
-                                  std::move(done), cancellation_token);
-
-  auto response = response_t->flat<string>();
-  int32* status_code_ptr = nullptr;
-  string* status_message_ptr = nullptr;
-  if (try_rpc) {
-    status_code_ptr = status_code_t->flat<int32>().data();
-    status_message_ptr = status_message_t->flat<string>().data();
-  }
-  for (int i = 0; i < num_elements; ++i) {
-    container->calls()->emplace_back(
-        container, i, try_rpc, get_request_ptr(i), &response(i),
-        (try_rpc) ? &status_code_ptr[i] : nullptr,
-        (try_rpc) ? &status_message_ptr[i] : nullptr);
-  }
+  CallContainer<GrpcCall>::CreateCallFn create_call_fn =
+      [this, &request_t, &try_rpc, response_t, status_code_t, status_message_t](
+          CallContainer<GrpcCall>* container, int index) {
+        CreateCall(request_t, try_rpc, index, container, response_t,
+                   status_code_t, status_message_t);
+      };
 
-  int i = 0;
-  for (GrpcCall& call : *(container->calls())) {
-    // This object will delete itself when done.
-    new RPCState<string>(get_stub(i), &completion_queue_, *get_method_ptr(i),
-                         call.request(), call.response(),
-                         /*done=*/[&call](const Status& s) { call.Done(s); },
-                         call.call_opts(), fail_fast_, timeout_in_ms_);
-    ++i;
-  }
+  CallContainer<GrpcCall>::StartCallFn start_call_fn =
+      [this, &address_t, &method_t](GrpcCall* call) {
+        StartCall(address_t, method_t, call);
+      };
 
-  // Need to register this callback after all the RPCs are in
-  // flight; otherwise we may try to cancel an RPC *before* it
-  // launches, which is a no-op, and then fall into a deadlock.
-  bool is_cancelled = !cm->RegisterCallback(
-      cancellation_token, [container]() { container->StartCancel(); });
-
-  if (is_cancelled) {
-    ctx->SetStatus(errors::Cancelled("Operation has been cancelled."));
-    // container's reference counter will take care of calling done().
-    container->StartCancel();
-  }
+  // This object will delete itself when done.
+  new CallContainer<GrpcCall>(ctx, num_elements, fail_fast_, try_rpc,
+                              std::move(done), std::move(create_call_fn),
+                              std::move(start_call_fn));
 }
 
 ::grpc::GenericStub* GrpcRPCFactory::GetOrCreateStubForAddress(
@@ -210,4 +164,53 @@ GrpcRPCFactory::ChannelPtr GrpcRPCFactory::CreateChannelForAddress(
       /*target=*/address, ::grpc::InsecureChannelCredentials(), args);
 }
 
+void GrpcRPCFactory::CreateCall(const Tensor& request_t, const bool try_rpc,
+                                int index, CallContainer<GrpcCall>* container,
+                                Tensor* response_t, Tensor* status_code_t,
+                                Tensor* status_message_t) {
+  auto request = request_t.flat<string>();
+  auto get_request_ptr = [&request](int64 ix) -> const string* {
+    return (request.size() > 1) ? &(request(ix)) : &(request(0));
+  };
+  auto response = response_t->flat<string>();
+  int32* status_code_ptr = nullptr;
+  string* status_message_ptr = nullptr;
+  if (try_rpc) {
+    status_code_ptr = status_code_t->flat<int32>().data();
+    status_message_ptr = status_message_t->flat<string>().data();
+  }
+  container->RegisterCall(container, index, try_rpc, get_request_ptr(index),
+                          &response(index),
+                          (try_rpc) ? &status_code_ptr[index] : nullptr,
+                          (try_rpc) ? &status_message_ptr[index] : nullptr);
+}
+
+void GrpcRPCFactory::StartCall(const Tensor& address_t, const Tensor& method_t,
+                               GrpcCall* call) {
+  auto address = address_t.flat<string>();
+  auto method = method_t.flat<string>();
+  // Stubs are maintained by the GrpcRPCFactory class and will be
+  // deleted when the class is destroyed.
+  ::grpc::GenericStub* singleton_stub = nullptr;
+  if (address.size() == 1) {
+    singleton_stub = GetOrCreateStubForAddress(address(0));
+  }
+  auto get_stub = [&address, this,
+                   singleton_stub](int64 ix) -> ::grpc::GenericStub* {
+    return (address.size() > 1) ? GetOrCreateStubForAddress(address(ix))
+                                : singleton_stub;
+  };
+  auto get_method_ptr = [&method](int64 ix) -> const string* {
+    return (method.size() > 1) ? &(method(ix)) : &(method(0));
+  };
+
+  int index = call->index();
+  // This object will delete itself when done.
+  new RPCState<string>(get_stub(index), &completion_queue_,
+                       *get_method_ptr(index), call->request(),
+                       call->response(),
+                       /*done=*/[call](const Status& s) { call->Done(s); },
+                       call->call_opts(), fail_fast_, timeout_in_ms_);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
index 34ec235aaf..29394c84b5 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
@@ -20,10 +20,16 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/util/rpc/call_container.h"
 #include "tensorflow/core/util/rpc/rpc_factory.h"
 
 namespace tensorflow {
 
+// Forward declaration of GrpcCall.
+namespace internal {
+class GrpcCall;
+}  // namespace internal
+
 class GrpcRPCFactory : public RPCFactory {
  public:
   explicit GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast,
@@ -42,6 +48,18 @@ class GrpcRPCFactory : public RPCFactory {
   virtual ChannelPtr CreateChannelForAddress(const string& address);
 
  private:
+  // Creates a call and registers it with the given `container`. The `index` is
+  // used to index into the tensor arguments.
+  void CreateCall(const Tensor& request_t, const bool try_rpc, int index,
+                  CallContainer<internal::GrpcCall>* container,
+                  Tensor* response_t, Tensor* status_code_t,
+                  Tensor* status_message_t);
+
+  // Asynchronously invokes the given `call`. The call completion is handled
+  // by the call container the call was previously registered with.
+  void StartCall(const Tensor& address_t, const Tensor& method_t,
+                 internal::GrpcCall* call);
+
   ::grpc::GenericStub* GetOrCreateStubForAddress(const string& address);
 
   bool fail_fast_;
diff --git a/tensorflow/core/util/rpc/call_container.h b/tensorflow/core/util/rpc/call_container.h
index 7f36056797..e1226a7f16 100644
--- a/tensorflow/core/util/rpc/call_container.h
+++ b/tensorflow/core/util/rpc/call_container.h
@@ -26,53 +26,60 @@ limitations under the License.
 
 namespace tensorflow {
 
-template <typename Call>
+namespace internal {
+// The following class is used for coordination between a `CallContainer`
+// instance and a cancellation callback to make sure that the `CallContainer`
+// instance waits for the cancellation callback to be destroyed (either because
+// a cancellation occurred or because the callback was deregistered) before
+// deleting itself. Without this coordination the cancellation callback could
+// attempt to access a `CallContainer` instance that is no longer valid.
+class NotifyWhenDestroyed {
+ public:
+  explicit NotifyWhenDestroyed(std::shared_ptr<Notification> notification)
+      : notification_(std::move(notification)) {}
+
+  ~NotifyWhenDestroyed() { notification_->Notify(); }
+
+ private:
+  std::shared_ptr<Notification> notification_;
+};
+}  // namespace internal
+
+// The following class is responsible for the life cycle management of a set of
+// RPC calls. The calls are started when an instance of the class is created and
+// the class contract guarantees to invoke a "done" callback provided by the
+// caller when all RPC calls have either completed or been cancelled.
+//
+// The caller should not make any assumptions about the validity of an instance
+// of this class after the provided callback has been invoked, which may be
+// immediately after the instance was created.
+template <class Call>
 class CallContainer {
  public:
+  typedef std::function<void(CallContainer<Call>*, int)> CreateCallFn;
+  typedef std::function<void(Call*)> StartCallFn;
+
+  // Uses the provided `create_call_fn` and `start_call_fn` functions to create
+  // and start a set of RPC calls. When all RPC calls have either completed or
+  // been cancelled, the `done` callback is invoked. The caller should not make
+  // any assumptions about the validity of the created instance as the instance
+  // will delete itself after invoking the `done` callback.
   explicit CallContainer(OpKernelContext* ctx, int num_calls, bool fail_fast,
                          bool try_rpc, AsyncOpKernel::DoneCallback done,
-                         CancellationToken token)
-      : ctx_(ctx),
-        done_(std::move(done)),
-        token_(token),
-        fail_fast_(fail_fast),
-        try_rpc_(try_rpc) {
-    CHECK_GT(num_calls, 0);
-
-    // This will run when all RPCs are finished.
-    reffed_status_callback_ = new ReffedStatusCallback([this](const Status& s) {
-      ctx_->cancellation_manager()->DeregisterCallback(token_);
-      ctx_->SetStatus(s);
-      done_();
-      delete this;
-    });
-
-    // Subtract reference count from the initial creation.
-    core::ScopedUnref unref(reffed_status_callback_);
-
-    for (int i = 0; i < num_calls; ++i) {
-      // Increase the reference on the callback for each new RPC.
-      reffed_status_callback_->Ref();
-    }
-  }
+                         CreateCallFn create_call_fn,
+                         StartCallFn start_call_fn);
 
-  std::list<Call>* calls() { return &calls_; }
+  // Registers a call with this container. This method expects its arguments to
+  // match those of a `Call` constructor as it forwards them to an underlying
+  // collection, which creates a `Call` instance in place.
+  template <class... Args>
+  void RegisterCall(Args&&... args);
 
-  void StartCancel() {
-    // Once this loop is done, can no longer assume anything is valid
-    // because "delete this" may have been immediately called.
-    // Nothing should run after this loop.
-    for (auto& call : calls_) {
-      call.StartCancel();
-    }
-  }
+  // Starts the cancellation of all RPC calls managed by this container.
+  void StartCancel();
 
-  void Done(const Status& s, int index) {
-    if (!try_rpc_) {
-      reffed_status_callback_->UpdateStatus(s);
-    }
-    reffed_status_callback_->Unref();
-  }
+  // Indicates that the `index`-th RPC call has finished.
+  void Done(const Status& s, int index);
 
  private:
   OpKernelContext* ctx_;
@@ -81,10 +88,88 @@ class CallContainer {
   const CancellationToken token_;
   const bool fail_fast_;
   const bool try_rpc_;
+  std::shared_ptr<Notification> callback_destroyed_;
 
   // Performs its own reference counting.
   ReffedStatusCallback* reffed_status_callback_;
 };
 
+template <class Call>
+CallContainer<Call>::CallContainer(
+    OpKernelContext* ctx, int num_calls, bool fail_fast, bool try_rpc,
+    AsyncOpKernel::DoneCallback done,
+    typename CallContainer<Call>::CreateCallFn create_call_fn,
+    typename CallContainer<Call>::StartCallFn start_call_fn)
+    : ctx_(ctx),
+      done_(std::move(done)),
+      token_(ctx->cancellation_manager()->get_cancellation_token()),
+      fail_fast_(fail_fast),
+      try_rpc_(try_rpc),
+      callback_destroyed_(new Notification) {
+  CHECK_GT(num_calls, 0);
+
+  // This will run when all RPCs are finished.
+  reffed_status_callback_ = new ReffedStatusCallback([this](const Status& s) {
+    ctx_->cancellation_manager()->DeregisterCallback(token_);
+    ctx_->SetStatus(s);
+    done_();
+    callback_destroyed_->WaitForNotification();
+    delete this;
+  });
+
+  // The cancellation callback needs to be registered before the RPC calls are
+  // started to make sure that the callback is properly cleaned up by the
+  // `reffed_status_callback` when all calls complete. At the same time, the
+  // cancellation callback should wait for the RPC calls to be started for the
+  // cancellation to take effect.
+  std::shared_ptr<internal::NotifyWhenDestroyed> notify_when_destroyed(
+      new internal::NotifyWhenDestroyed(callback_destroyed_));
+  std::shared_ptr<Notification> calls_started(new Notification);
+  bool is_cancelled = !ctx_->cancellation_manager()->RegisterCallback(
+      token_, [this, calls_started, notify_when_destroyed]() {
+        calls_started->WaitForNotification();
+        StartCancel();
+      });
+
+  for (int i = 0; i < num_calls; ++i) {
+    create_call_fn(this, i);
+    // Increase the reference on the callback for each new RPC.
+    reffed_status_callback_->Ref();
+  }
+  for (Call& call : calls_) {
+    start_call_fn(&call);
+  }
+  calls_started->Notify();
+
+  if (is_cancelled) {
+    ctx_->SetStatus(errors::Cancelled("Operation has been cancelled."));
+    StartCancel();
+  }
+
+  // Subtract reference count from the initial creation.
+  reffed_status_callback_->Unref();
+}
+
+template <class Call>
+template <class... Args>
+void CallContainer<Call>::RegisterCall(Args&&... args) {
+  calls_.emplace_back(std::forward<Args>(args)...);
+}
+
+template <class Call>
+void CallContainer<Call>::StartCancel() {
+  for (auto& call : calls_) {
+    call.StartCancel();
+  }
+}
+
+template <class Call>
+void CallContainer<Call>::Done(const Status& s, int index) {
+  if (!try_rpc_) {
+    reffed_status_callback_->UpdateStatus(s);
+  }
+  reffed_status_callback_->Unref();
+}
+
 }  // namespace tensorflow
 #endif  // TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory.h b/tensorflow/core/util/rpc/rpc_factory.h
index 9bf078c0f4..c4eaaf4457 100644
--- a/tensorflow/core/util/rpc/rpc_factory.h
+++ b/tensorflow/core/util/rpc/rpc_factory.h
@@ -32,10 +32,11 @@ class RPCFactory {
   RPCFactory() {}
   virtual ~RPCFactory() {}
 
-  // Start a Call() to methods `method_t` at addresses `address_t` with
+  // Asynchronously invokes methods `method_t` at addresses `address_t` with
   // request strings from `request_t`.  Any of these may be scalar
   // Tensors, in which case the operands are broadcasted.
-  // Upon completion of all requests, `response_t` will be populated.
+  // Upon completion of all requests, `response_t` will be populated and the
+  // `done` callback will be invoked.
   //
   // If `try_rpc` is `true`, then `status_message_t` and
   // `status_code_t` will be populated as well.
-- 
GitLab


From 4355b923c273a4e07655f860a95428b2db977741 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 13:21:49 -0700
Subject: [PATCH 377/450] Implement hoisting of common prefix of unary ops to
 concat.

PiperOrigin-RevId: 194135148
---
 tensorflow/core/grappler/op_types.cc          | 113 ++++++++---
 tensorflow/core/grappler/op_types.h           |   2 +
 .../optimizers/arithmetic_optimizer.cc        | 187 +++++++++++++++++-
 .../optimizers/arithmetic_optimizer.h         |   5 +
 .../optimizers/arithmetic_optimizer_test.cc   | 102 ++++++++++
 5 files changed, 378 insertions(+), 31 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 9c45aed62f..f595cf6456 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -451,43 +452,101 @@ OPDEF_PROPERTY_HELPER(Aggregate, aggregate)
 OPDEF_PROPERTY_HELPER(Commutative, commutative)
 
 bool IsInvolution(const NodeDef& node) {
-  const std::unordered_set<string> involution_ops{
-      "Conj", "Reciprocal", "Invert", "Neg", "LogicalNot"};
-  return involution_ops.count(node.op()) > 0;
+  static const std::unordered_set<string>* involution_ops =
+      CHECK_NOTNULL((new std::unordered_set<string>{
+          "Conj", "Reciprocal", "Invert", "Neg", "LogicalNot"}));
+  return involution_ops->count(node.op()) > 0;
 }
 
 bool IsValueAndOrderPreserving(const NodeDef& node) {
   if (NumNonControlInputs(node) == 1 && IsAggregate(node)) {
     return true;
   }
-  const std::unordered_set<string> value_and_order_preserving_ops{
-      "CheckNumerics",
-      "DebugGradientIdentity",
-      "DeepCopy"
-      "Enter",
-      "Exit",
-      "ExpandDims",
-      "Identity",
-      "IdentityN",
-      "PreventGradient",
-      "Print",
-      "Reshape",
-      "Snapshot",
-      "Squeeze",
-      "StopGradient",
-  };
-  return value_and_order_preserving_ops.count(node.op()) > 0;
+  static const std::unordered_set<string>* value_and_order_preserving_ops =
+      CHECK_NOTNULL((new const std::unordered_set<string>{
+          "CheckNumerics",
+          "DebugGradientIdentity",
+          "DeepCopy",
+          "Enter",
+          "Exit",
+          "ExpandDims",
+          "Identity",
+          "IdentityN",
+          "PreventGradient",
+          "Print",
+          "Reshape",
+          "Snapshot",
+          "Squeeze",
+          "StopGradient",
+      }));
+  return value_and_order_preserving_ops->count(node.op()) > 0;
 }
 
 bool IsValuePreserving(const NodeDef& node) {
-  const std::unordered_set<string> value_preserving_ops{
-      "InvertPermutation",
-      "Reverse",
-      "Roll",
-      "Transpose",
-  };
+  static const std::unordered_set<string>* value_preserving_ops =
+      CHECK_NOTNULL((new std::unordered_set<string>{
+          "InvertPermutation",
+          "Reverse",
+          "Roll",
+          "Transpose",
+      }));
   return IsValueAndOrderPreserving(node) ||
-         value_preserving_ops.count(node.op()) > 0;
+         value_preserving_ops->count(node.op()) > 0;
+}
+
+bool IsUnaryElementWise(const NodeDef& node) {
+  static const std::unordered_set<string>* element_wise_ops =
+      CHECK_NOTNULL((new std::unordered_set<string>{
+          "Abs",
+          "Acos",
+          "Acosh",
+          "Asin",
+          "Asinh",
+          "Atan",
+          // "Atan2" is binary, not unary, so it must not appear in this set.
+          "Atanh",
+          "Ceil",
+          "ComplexAbs",
+          "Conj",
+          "Cos",
+          "Cosh",
+          "Digamma",
+          "Elu",
+          "Erf",
+          "Erfc",
+          "Exp",
+          "Expm1",
+          "Floor",
+          "Inv",
+          "Invert",
+          "Isinf",
+          "Isnan",
+          "Isfinite",
+          "Lgamma",
+          "Log",
+          "Log1p",
+          "LogicalNot",
+          "Neg",
+          "Reciprocal",
+          "Relu",
+          "Relu6",
+          "Rint",
+          "Round",
+          "Selu",
+          "Rsqrt",
+          "Sigmoid",
+          "Sign",
+          "Sin",
+          "Sinh",
+          "Softplus",
+          "Softsign",
+          "Sqrt",
+          "Square",
+          "Tan",
+          "Tanh",
+      }));
+  return element_wise_ops->count(node.op()) > 0 ||
+         (!IsIdentityN(node) && IsValueAndOrderPreserving(node));
+}
 
 bool HasOpDef(const NodeDef& node) {
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 79fd05e187..7f5da19d90 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -177,6 +177,8 @@ bool IsValueAndOrderPreserving(const NodeDef& node);
 // function returns true if the op commutes with all element-wise operations.
 bool IsValuePreserving(const NodeDef& node);
 
+bool IsUnaryElementWise(const NodeDef& node);
+
 // Returns true if we can find an opdef corresponding to the op of the node.
 bool HasOpDef(const NodeDef& node);
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index ed199c1ac8..866b993e93 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -1340,6 +1340,182 @@ class RemoveNegationStage : public ArithmeticOptimizerStage {
   }
 };
 
+// This optimization hoists the common prefix of unary ops of the inputs to
+// concat out of the concat.
+// For example: Concat([Exp(Sin(x)), Exp(Sin(y)), Exp(Sin(z))]) ->
+// Exp(Sin(Concat([x, y, z]))).
+// TODO(rmlarsen): Support casting. We would have to change the type attribute
+// on the concat node.
+class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
+ public:
+  explicit HoistCWiseUnaryFromConcatStage(
+      const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("HoistCWiseUnaryFromConcat", ctx, ctx_ext) {}
+
+  ~HoistCWiseUnaryFromConcatStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    if (!IsConcat(*node)) return false;
+    const int n = node->attr().at("N").i();
+    return n > 1;
+  }
+
+  Status TrySimplify(NodeDef* concat_node,
+                     string* simplified_node_name) override {
+    int prefix_length;
+    std::set<string> ctrl_inputs;
+    TF_RETURN_IF_ERROR(
+        FindCommonUnaryOpPrefix(*concat_node, &prefix_length, &ctrl_inputs));
+    if (prefix_length > 0) {
+      TF_RETURN_IF_ERROR(
+          HoistUnaryOpPrefix(prefix_length, &ctrl_inputs, concat_node));
+      AddToOptimizationQueue(concat_node);
+    }
+    return Status::OK();
+  }
+
+ private:
+  void RemoveControlInputs(std::set<string>* removed_ctrl_inputs,
+                           NodeDef* node) const {
+    const int num_inputs = node->input_size();
+    for (int idx = num_inputs - 1; idx >= 0; --idx) {
+      const string& input = node->input(idx);
+      if (IsControlInput(input)) {
+        removed_ctrl_inputs->insert(input);
+        ctx().node_map->RemoveOutput(NodeName(input), node->name());
+        node->mutable_input()->RemoveLast();
+      } else {
+        break;
+      }
+    }
+  }
+
+  void AddControlInputs(std::set<string>* new_ctrl_inputs,
+                        NodeDef* node) const {
+    for (int idx = node->input_size() - 1; idx >= 0; --idx) {
+      const string& existing_input = node->input(idx);
+      if (IsControlInput(existing_input)) {
+        new_ctrl_inputs->erase(existing_input);
+      } else {
+        break;
+      }
+    }
+    for (const string& new_input : *new_ctrl_inputs) {
+      ctx().node_map->AddOutput(NodeName(new_input), node->name());
+      node->add_input(new_input);
+    }
+  }
+
+  // Computes in `*prefix_length` the length of the common unary op chain that
+  // can be hoisted; strips those ops' control inputs into `*ctrl_inputs`.
+  Status FindCommonUnaryOpPrefix(const NodeDef& concat_node, int* prefix_length,
+                                 std::set<string>* ctrl_inputs) const {
+    *prefix_length = 0;
+    const int n = concat_node.attr().at("N").i();
+    // Follow the chains backwards from each concat input as long as all the
+    // following conditions hold:
+    //   1. The ops in all chains are the same.
+    //   2. The op is a unary elementwise op.
+    //   3. The op output has only a single consumer.
+    std::vector<NodeDef*> tail(n, nullptr);
+    const int start = concat_node.op() == "Concat" ? 1 : 0;
+    const int end = start + n;
+    // Set up tail pointers to point to the immediate inputs to Concat.
+    for (int i = start; i < end; ++i) {
+      if (IsControlInput(concat_node.input(i))) {
+        return errors::FailedPrecondition("Got control input ",
+                                          concat_node.input(i),
+                                          " where normal input was expected.");
+      }
+      TF_RETURN_IF_ERROR(GetInputNode(concat_node.input(i), &tail[i - start]));
+    }
+
+    bool stop = false;
+    ctrl_inputs->clear();
+    while (!stop) {
+      const NodeDef* tail0 = tail[0];
+      if (!IsUnaryElementWise(*tail0)) break;
+      for (int chain = 0; chain < n; ++chain) {
+        // TODO(rmlarsen): Allow and hoist outgoing control edges.
+        if (tail[chain]->op() != tail0->op() ||
+            ctx().node_map->GetOutputs(tail[chain]->name()).size() > 1) {
+          stop = true;
+          break;
+        }
+      }
+      if (stop) break;
+      // We found one more op that can be hoisted.
+      ++(*prefix_length);
+      for (int chain = 0; chain < n; ++chain) {
+        RemoveControlInputs(ctrl_inputs, tail[chain]);
+      }
+      // Advance tail pointers to the next level.
+      for (int chain = 0; chain < n; ++chain) {
+        if (tail[chain]->input_size() == 0 ||
+            IsControlInput(tail[chain]->input(0))) {
+          stop = true;
+          break;
+        } else {
+          NodeDef* new_tail = nullptr;
+          TF_RETURN_IF_ERROR(GetInputNode(tail[chain]->input(0), &new_tail));
+          tail[chain] = new_tail;
+        }
+      }
+    }
+    return Status::OK();
+  }
+
+  Status HoistUnaryOpPrefix(const int prefix_length,
+                            std::set<string>* ctrl_inputs,
+                            NodeDef* concat_node) {
+    const int n = concat_node->attr().at("N").i();
+    const int start = concat_node->op() == "Concat" ? 1 : 0;
+    const int end = start + n;
+    const std::set<NodeDef*> consumers =
+        ctx().node_map->GetOutputs(concat_node->name());
+    AddControlInputs(ctrl_inputs, concat_node);
+    for (int chain = 0; chain < (end - start); ++chain) {
+      NodeDef* tail = nullptr;
+      const string concat_input = concat_node->input(chain + start);
+      for (int distance = 0; distance < prefix_length; ++distance) {
+        if (distance == 0) {
+          TF_RETURN_IF_ERROR(GetInputNode(concat_input, &tail));
+        } else {
+          TF_RETURN_IF_ERROR(GetInputNode(tail->input(0), &tail));
+        }
+      }
+
+      // Hook the node following tail directly into the concat node.
+      const string tail_input = tail->input(0);
+      concat_node->set_input(chain + start, tail_input);
+      ctx().node_map->UpdateInput(concat_node->name(), concat_input,
+                                  tail_input);
+
+      if (chain == 0) {
+        // Reuse nodes in the first chain to process output of concat.
+        tail->set_input(0, concat_node->name());
+        ctx().node_map->UpdateInput(tail->name(), tail_input,
+                                    concat_node->name());
+
+        // Update the consumers of concat to consume the end of the chain
+        // instead.
+        for (NodeDef* consumer : consumers) {
+          for (int idx = 0; idx < consumer->input_size(); ++idx) {
+            if (consumer->input(idx) == concat_node->name()) {
+              consumer->set_input(idx, concat_input);
+              ctx().node_map->UpdateInput(consumer->name(), concat_node->name(),
+                                          concat_input);
+            }
+          }
+          AddToOptimizationQueue(consumer);
+        }
+      }
+    }
+    return Status::OK();
+  }
+};
+
 }  // namespace
 
 class UniqueNodes {
@@ -1995,6 +2171,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<RemoveRedundantCastStage>(ctx, ctx_ext);
   if (options_.remove_negation)
     pipeline.AddStage<RemoveNegationStage>(ctx, ctx_ext);
+  if (options_.hoist_unary_out_of_concat)
+    pipeline.AddStage<HoistCWiseUnaryFromConcatStage>(ctx, ctx_ext);
 
   VLOG(1) << "Run " << pipeline.NumStages() << " arithmetic optimizer stages: "
           << str_util::Join(pipeline.StageNames(), ", ");
@@ -2062,17 +2240,18 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
   *optimized_graph = item.graph;
-  optimized_graph_ = optimized_graph;
+  GrapplerItem optimized_item(item, optimized_graph);
+  optimized_graph_ = &optimized_item.graph;
   node_map_.reset(new NodeMap(optimized_graph_));
 
-  DedupComputations();
+  if (options_.dedup_computations) {
+    DedupComputations();
+  }
 
   // Perform topological sort on the graph in order to help AddOpsRewrite to
   // optimize larger subgraphs starting from the roots with more inputs.
   TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph_));
 
-  GrapplerItem optimized_item(item, optimized_graph);
-  optimized_graph_ = &optimized_item.graph;
   graph_properties_.reset(new GraphProperties(optimized_item));
   const Status status = graph_properties_->InferStatically(false);
   const bool can_use_shapes = status.ok();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 344c8281eb..375f13acc1 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -56,6 +56,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
   struct ArithmeticOptimizerOptions {
     // TODO(ezhulenev): flag do disable TrySimplifyAndReplaceUses in tests.
     // Remove when all optimizers will be migrated to separate stages.
+    bool dedup_computations = true;
     bool enable_try_simplify_and_replace = true;
     bool combine_add_to_addn = true;
     bool hoist_common_factor_out_of_aggregation = true;
@@ -64,12 +65,16 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
     bool remove_negation = true;
+    bool hoist_unary_out_of_concat = false;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
     static ArithmeticOptimizerOptions Default(
         RewriterConfig::Toggle opt_level) {
       ArithmeticOptimizerOptions options;
+      if (opt_level == RewriterConfig::AGGRESSIVE) {
+        options.hoist_unary_out_of_concat = true;
+      }
       return options;
     }
   };
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index cb1f2ea732..df10dbdf48 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -98,6 +98,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
   // should explicitly enable required optimization for tests isolation
   void DisableAllStages(ArithmeticOptimizer* optimizer) {
     ArithmeticOptimizer::ArithmeticOptimizerOptions options;
+    options.dedup_computations = false;
     options.enable_try_simplify_and_replace = false;
     options.combine_add_to_addn = false;
     options.hoist_common_factor_out_of_aggregation = false;
@@ -147,6 +148,10 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     DisableAllStages(optimizer);
     optimizer->options_.remove_negation = true;
   }
+  void EnableOnlyHoistCWiseUnaryFromConcat(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.hoist_unary_out_of_concat = true;
+  }
 };
 
 TEST_F(ArithmeticOptimizerTest, NoOp) {
@@ -2086,5 +2091,102 @@ TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_BuildTreeUp) {
   EXPECT_EQ("mul1", mul3_node->input(1));
 }
 
+TEST_F(ArithmeticOptimizerTest, HoistCWiseUnaryFromConcat) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  Output b = ops::Variable(s.WithOpName("b"), {32}, DT_FLOAT);
+  Output c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+  Output axis = ops::Const(s.WithOpName("axis"), 0, {});
+  Output ctrl1 = ops::Const(s.WithOpName("ctrl1"), 1, {});
+  Output ctrl2 = ops::Const(s.WithOpName("ctrl2"), 2, {});
+  Output ctrl3 = ops::Const(s.WithOpName("ctrl3"), 3, {});
+  // Test case with chains of length 1.
+  Output sin_a =
+      ops::Sin(s.WithOpName("sin_a").WithControlDependencies(ctrl3), a);
+  Output exp_a =
+      ops::Exp(s.WithOpName("exp_a").WithControlDependencies(ctrl1), sin_a);
+  Output exp_b = ops::Exp(s.WithOpName("exp_b"), b);
+  Output exp_c =
+      ops::Exp(s.WithOpName("exp_c").WithControlDependencies(ctrl2), c);
+  Output concat =
+      ops::Concat(s.WithOpName("concat"), {exp_a, exp_b, exp_c}, axis);
+  Output id = ops::Identity(s.WithOpName("id"), concat);
+
+  // Test case with chains of length 2.
+  Output exp_a2 =
+      ops::Exp(s.WithOpName("exp_a2").WithControlDependencies(ctrl1), sin_a);
+  Output exp_b2 = ops::Exp(s.WithOpName("exp_b2"), b);
+  Output exp_c2 =
+      ops::Exp(s.WithOpName("exp_c2").WithControlDependencies(ctrl2), c);
+  Output cos_exp_a2 = ops::Cos(
+      s.WithOpName("cos_exp_a2").WithControlDependencies(ctrl1), exp_a2);
+  Output cos_exp_b2 = ops::Cos(
+      s.WithOpName("cos_exp_b2").WithControlDependencies(ctrl3), exp_b2);
+  Output cos_exp_c2 = ops::Cos(s.WithOpName("cos_exp_c2"), exp_c2);
+  Output concat2 = ops::Concat(s.WithOpName("concat2"),
+                               {cos_exp_a2, cos_exp_b2, cos_exp_c2}, axis);
+  Output id2 = ops::Identity(s.WithOpName("id2"), concat2);
+  GrapplerItem item;
+  item.fetch = {"id", "id2"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyHoistCWiseUnaryFromConcat(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "concat") {
+      EXPECT_EQ(6, node.input_size());
+      EXPECT_EQ("sin_a", node.input(0));
+      EXPECT_EQ("b", node.input(1));
+      EXPECT_EQ("c", node.input(2));
+      EXPECT_EQ("axis", node.input(3));
+      EXPECT_EQ("^ctrl1", node.input(4));
+      EXPECT_EQ("^ctrl2", node.input(5));
+      found++;
+    }
+    if (node.name() == "exp_a") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("concat", node.input(0));
+      found++;
+    }
+    if (node.name() == "id") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("exp_a", node.input(0));
+      found++;
+    }
+
+    if (node.name() == "concat2") {
+      EXPECT_EQ(7, node.input_size());
+      EXPECT_EQ("sin_a", node.input(0));
+      EXPECT_EQ("b", node.input(1));
+      EXPECT_EQ("c", node.input(2));
+      EXPECT_EQ("axis", node.input(3));
+      EXPECT_EQ("^ctrl1", node.input(4));
+      EXPECT_EQ("^ctrl2", node.input(5));
+      EXPECT_EQ("^ctrl3", node.input(6));
+      found++;
+    }
+    if (node.name() == "exp_a2") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("concat2", node.input(0));
+      found++;
+    }
+    if (node.name() == "cos_exp_a2") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("exp_a2", node.input(0));
+      found++;
+    }
+    if (node.name() == "id2") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("cos_exp_a2", node.input(0));
+      found++;
+    }
+  }
+  EXPECT_EQ(7, found);
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From a3691c4af225126e14b0df1f30969899b33de243 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 13:47:35 -0700
Subject: [PATCH 378/450] - Add a way to specify custom updater args to
 updaters in the optimizer. - Create RegAdagradOptimizer which allows the user
 to specify whether a gradient update is allowed to update the slot vars.

PiperOrigin-RevId: 194139121
---
 tensorflow/contrib/opt/BUILD                  |  20 +
 .../python/training/reg_adagrad_optimizer.py  | 107 ++++++
 .../training/reg_adagrad_optimizer_test.py    | 343 ++++++++++++++++++
 3 files changed, 470 insertions(+)
 create mode 100644 tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py
 create mode 100644 tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py

diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 612ecc3e63..13aa1d7e7a 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -25,6 +25,7 @@ py_library(
         "python/training/multitask_optimizer_wrapper.py",
         "python/training/nadam_optimizer.py",
         "python/training/powersign.py",
+        "python/training/reg_adagrad_optimizer.py",
         "python/training/sign_decay.py",
         "python/training/variable_clipping_optimizer.py",
     ],
@@ -155,6 +156,25 @@ py_test(
     ],
 )
 
+py_test(
+    name = "reg_adagrad_optimizer_test",
+    srcs = ["python/training/reg_adagrad_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "nadam_optimizer_test",
     srcs = ["python/training/nadam_optimizer_test.py"],
diff --git a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py
new file mode 100644
index 0000000000..d0e0405a2c
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py
@@ -0,0 +1,107 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RegAdagrad for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import adagrad
+from tensorflow.python.training import training_ops
+from tensorflow.python.util import tf_contextlib
+
+
+class RegAdagradOptimizer(adagrad.AdagradOptimizer):
+  """RegAdagrad: Adagrad with updates that optionally skip updating the slots.
+
+  This is meant to address the problem of additional regularization terms in the
+  loss function affecting learning rate decay and causing hyper-param
+  entanglement. Example usage:
+
+    loss = tf.nn.cross_entropy(x, labels)
+    reg_loss = reg_strength * tf.reduce_sum(x * x)
+    opt = tf.contrib.opt.RegAdagradOptimizer(learning_rate)
+    loss_update = opt.minimize(loss)
+    with opt.avoid_updating_slots():
+      reg_update = opt.minimize(reg_loss)
+    total_update = tf.group([loss_update, reg_update])
+
+    # ...
+
+    sess.run(total_update, ...)
+  """
+
+  def __init__(self,
+               learning_rate,
+               initial_accumulator_value=0.1,
+               use_locking=False,
+               name="RegAdagrad"):
+    super(RegAdagradOptimizer, self).__init__(
+        learning_rate,
+        initial_accumulator_value=initial_accumulator_value,
+        use_locking=use_locking,
+        name=name)
+    self._should_update_slots = True
+
+  @tf_contextlib.contextmanager
+  def avoid_updating_slots(self):
+    old = self._should_update_slots
+    self._should_update_slots = False
+    try:
+      yield
+    finally:
+      self._should_update_slots = old
+
+  def _apply_dense(self, grad, var):
+    acc = self.get_slot(var, "accumulator")
+    return training_ops.apply_adagrad(
+        var,
+        acc,
+        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking,
+        update_slots=self._should_update_slots)
+
+  def _resource_apply_dense(self, grad, var, update_slots=True):
+    acc = self.get_slot(var, "accumulator")
+    return training_ops.resource_apply_adagrad(
+        var.handle,
+        acc.handle,
+        math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking,
+        update_slots=self._should_update_slots)
+
+  def _apply_sparse(self, grad, var, update_slots=True):
+    acc = self.get_slot(var, "accumulator")
+    return training_ops.sparse_apply_adagrad(
+        var,
+        acc,
+        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        use_locking=self._use_locking,
+        update_slots=self._should_update_slots)
+
+  def _resource_apply_sparse(self, grad, var, indices, update_slots=True):
+    acc = self.get_slot(var, "accumulator")
+    return training_ops.resource_sparse_apply_adagrad(
+        var.handle,
+        acc.handle,
+        math_ops.cast(self._learning_rate_tensor, grad.dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking,
+        update_slots=self._should_update_slots)
diff --git a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py
new file mode 100644
index 0000000000..ea56e1646a
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py
@@ -0,0 +1,343 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for reg_adagrad_optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import reg_adagrad_optimizer
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class RegAdagradOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_locking=False, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+          var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        else:
+          var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+          var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0, initial_accumulator_value=0.1, use_locking=use_locking)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testBasic(self):
+    self.doTestBasic(use_locking=False)
+
+  def testBasicResource(self):
+    self.doTestBasic(use_locking=False, use_resource=True)
+
+  def testBasicLocked(self):
+    self.doTestBasic(use_locking=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable(
+            [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = reg_adagrad_optimizer.RegAdagradOptimizer(1.0).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0], [3.0, 4.0]],
+                                           var0.eval())
+        # Run 1 step of RegAdagrad (despite the `sgd_op` name)
+        sgd_op.run()
+        # Only row 0 was looked up, so only row 0 is expected to change
+        self.assertAllCloseAccordingToType(
+            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            constant_op.constant(3.0), initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([[1.0], [2.0]], var0.eval())
+        self.assertAllClose([[3.0], [4.0]], var1.eval())
+        # Run 3 steps of RegAdagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params: only the indexed rows are expected to change
+        self.assertAllCloseAccordingToType(
+            np.array([[-1.6026098728179932], [2.0]]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([[3.0], [3.715679168701172]]), var1.eval())
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant([0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]), constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant([0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        repeated_update = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0).apply_gradients([(grad_repeated_index,
+                                   repeated_index_update_var)])
+        aggregated_update = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0).apply_gradients([(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def testSparseRepeatedIndicesResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var_repeated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_repeated = math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_repeated, [0, 0]))
+        var_aggregated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_aggregated = 2 * math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_aggregated, [0]))
+        update_op_repeated = reg_adagrad_optimizer.RegAdagradOptimizer(
+            2.0).minimize(loss_repeated)
+        update_op_aggregated = reg_adagrad_optimizer.RegAdagradOptimizer(
+            2.0).minimize(loss_aggregated)
+        variables.global_variables_initializer().run()
+        self.assertAllCloseAccordingToType(var_repeated.eval(),
+                                           var_aggregated.eval())
+        for _ in range(3):
+          update_op_repeated.run()
+          update_op_aggregated.run()
+          self.assertAllCloseAccordingToType(var_repeated.eval(),
+                                             var_aggregated.eval())
+
+  def testSparseStability(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        shape = [1, 6]
+        var0 = variables.Variable(
+            [[
+                0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257,
+                -0.0105945
+            ]],
+            dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [[
+                    -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05,
+                    -8.4877e-05, -9.48906e-05
+                ]],
+                shape=shape,
+                dtype=dtype), constant_op.constant([0]),
+            constant_op.constant(shape))
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            1.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        init = variables.global_variables_initializer()
+        for _ in range(100):
+          init.run()
+          ada_update.run()
+          self.assertAllCloseAccordingToType(
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+          self.assertAllCloseAccordingToType(
+              np.array([[
+                  0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
+                  -0.01029443
+              ]]), var0.eval())
+
+  def testSharing(self):
+    """Two apply_gradients ops from one optimizer share the same slots."""
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(3.0)
+        # Apply the optimizer twice; both applications use the same accums.
+        ada_update1 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        ada_update2 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values.
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Mix the first and the second adagrad for 3 steps.
+        ada_update1.run()
+        ada_update2.run()
+        ada_update1.run()
+        # Validate updated params (the same as with only 1 RegAdagrad).
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testDynamicShapeVariable_Ok(self):
+    with self.test_session():
+      v = variable_scope.get_variable(
+          "v", initializer=constant_op.constant(1.), validate_shape=False)
+      self.assertFalse(v.shape.is_fully_defined())
+      # Creating optimizer should cause no exception.
+      reg_adagrad_optimizer.RegAdagradOptimizer(
+          3.0, initial_accumulator_value=0.1)
+
+  def testSkipUpdatingSlots(self):
+    """Ops built under avoid_updating_slots() leave accumulators unchanged."""
+    iav = 0.130005  # A value that works with float16
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0, initial_accumulator_value=iav)
+        # Build the update op while slot updates are disabled.
+        with ada_opt.avoid_updating_slots():
+          ada_update = ada_opt.apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values.
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of RegAdagrad.
+        for _ in range(3):
+          ada_update.run()
+        # Validate that ada_opt's slots are not updated.
+        self.assertAllCloseAccordingToType(np.array([iav, iav]), slot0.eval())
+        self.assertAllCloseAccordingToType(np.array([iav, iav]), slot1.eval())
+
+  def testSparseSkipUpdatingSlots(self):
+    """Sparse (IndexedSlices) gradients also honor avoid_updating_slots()."""
+    iav = 0.130005  # A value that works with float16
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(
+            3.0, initial_accumulator_value=iav)
+        with ada_opt.avoid_updating_slots():
+          ada_update = ada_opt.apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([[1.0], [2.0]], var0.eval())
+        self.assertAllClose([[3.0], [4.0]], var1.eval())
+        # Run 3 steps of RegAdagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate that ada_opt's slots are not updated.
+        self.assertAllCloseAccordingToType(
+            np.array([[iav], [iav]]), slot0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([[iav], [iav]]), slot1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
-- 
GitLab


From e36ebcc88f0831c9fc16d0f5b060d076af8c0849 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Tue, 24 Apr 2018 13:58:37 -0700
Subject: [PATCH 379/450] Revert #18251 due to the following issue: - calling
 convolution with args instead of kwargs from convolutionXd breaks when called
 within arg_scope. - intentional use cases trigger the added dimension error.

PiperOrigin-RevId: 194140820
---
 .../contrib/layers/python/layers/layers.py    | 142 +-----------------
 .../layers/python/layers/layers_test.py       |  15 +-
 2 files changed, 7 insertions(+), 150 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 2f3e57653c..25c3b1e7ea 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -932,8 +932,7 @@ def convolution(inputs,
                 variables_collections=None,
                 outputs_collections=None,
                 trainable=True,
-                scope=None,
-                conv_dims=None):
+                scope=None):
   """Adds an N-D convolution followed by an optional batch_norm layer.
 
   It is required that 1 <= N <= 3.
@@ -994,10 +993,6 @@ def convolution(inputs,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     scope: Optional scope for `variable_scope`.
-    conv_dims: Optional convolution dimensionality, when set it would use the
-      corresponding convolution (e.g. 2 for Conv 2D, 3 for Conv 3D, ..). When
-      leaved to None it would select the convolution dimensionality based on
-      the input rank (i.e. Conv ND, with N = input_rank - 2).
 
   Returns:
     A tensor representing the output of the operation.
@@ -1020,9 +1015,6 @@ def convolution(inputs,
     inputs = ops.convert_to_tensor(inputs)
     input_rank = inputs.get_shape().ndims
 
-    if conv_dims is not None and conv_dims + 2 != input_rank:
-      raise ValueError('Convolution expects input with rank %d, got %d' %
-                       (conv_dims + 2, input_rank))
     if input_rank == 3:
       layer_class = convolutional_layers.Convolution1D
     elif input_rank == 4:
@@ -1069,134 +1061,10 @@ def convolution(inputs,
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
-@add_arg_scope
-def convolution1d(inputs,
-                  num_outputs,
-                  kernel_size,
-                  stride=1,
-                  padding='SAME',
-                  data_format=None,
-                  rate=1,
-                  activation_fn=nn.relu,
-                  normalizer_fn=None,
-                  normalizer_params=None,
-                  weights_initializer=initializers.xavier_initializer(),
-                  weights_regularizer=None,
-                  biases_initializer=init_ops.zeros_initializer(),
-                  biases_regularizer=None,
-                  reuse=None,
-                  variables_collections=None,
-                  outputs_collections=None,
-                  trainable=True,
-                  scope=None):
-  return convolution(inputs,
-                     num_outputs,
-                     kernel_size,
-                     stride,
-                     padding,
-                     data_format,
-                     rate,
-                     activation_fn,
-                     normalizer_fn,
-                     normalizer_params,
-                     weights_initializer,
-                     weights_regularizer,
-                     biases_initializer,
-                     biases_regularizer,
-                     reuse,
-                     variables_collections,
-                     outputs_collections,
-                     trainable,
-                     scope,
-                     conv_dims=1)
-
-convolution1d.__doc__ = convolution.__doc__
 
-@add_arg_scope
-def convolution2d(inputs,
-                  num_outputs,
-                  kernel_size,
-                  stride=1,
-                  padding='SAME',
-                  data_format=None,
-                  rate=1,
-                  activation_fn=nn.relu,
-                  normalizer_fn=None,
-                  normalizer_params=None,
-                  weights_initializer=initializers.xavier_initializer(),
-                  weights_regularizer=None,
-                  biases_initializer=init_ops.zeros_initializer(),
-                  biases_regularizer=None,
-                  reuse=None,
-                  variables_collections=None,
-                  outputs_collections=None,
-                  trainable=True,
-                  scope=None):
-  return convolution(inputs,
-                     num_outputs,
-                     kernel_size,
-                     stride,
-                     padding,
-                     data_format,
-                     rate,
-                     activation_fn,
-                     normalizer_fn,
-                     normalizer_params,
-                     weights_initializer,
-                     weights_regularizer,
-                     biases_initializer,
-                     biases_regularizer,
-                     reuse,
-                     variables_collections,
-                     outputs_collections,
-                     trainable,
-                     scope,
-                     conv_dims=2)
-
-convolution2d.__doc__ = convolution.__doc__
+convolution2d = convolution
+convolution3d = convolution
 
-@add_arg_scope
-def convolution3d(inputs,
-                  num_outputs,
-                  kernel_size,
-                  stride=1,
-                  padding='SAME',
-                  data_format=None,
-                  rate=1,
-                  activation_fn=nn.relu,
-                  normalizer_fn=None,
-                  normalizer_params=None,
-                  weights_initializer=initializers.xavier_initializer(),
-                  weights_regularizer=None,
-                  biases_initializer=init_ops.zeros_initializer(),
-                  biases_regularizer=None,
-                  reuse=None,
-                  variables_collections=None,
-                  outputs_collections=None,
-                  trainable=True,
-                  scope=None):
-  return convolution(inputs,
-                     num_outputs,
-                     kernel_size,
-                     stride,
-                     padding,
-                     data_format,
-                     rate,
-                     activation_fn,
-                     normalizer_fn,
-                     normalizer_params,
-                     weights_initializer,
-                     weights_regularizer,
-                     biases_initializer,
-                     biases_regularizer,
-                     reuse,
-                     variables_collections,
-                     outputs_collections,
-                     trainable,
-                     scope,
-                     conv_dims=3)
-
-convolution3d.__doc__ = convolution.__doc__
 
 @add_arg_scope
 def convolution2d_in_plane(
@@ -1543,7 +1411,7 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
   Args:
      tensor: An `int` `Tensor` to be converted to a `Sparse`.
      eos_token: An integer.
-       It is part of the target label that signifies the end of a sentence.
+       It is part of the target label that signfies the end of a sentence.
      outputs_collections: Collection to add the outputs.
      scope: Optional scope for name_scope.
   """
@@ -1687,7 +1555,7 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
     output_collections: Collection to which the outputs will be added.
     scope: Optional scope for `name_scope`.
   Returns:
-    A `Tensor` or `SparseTensor` containing the same values as `inputs`, but
+    A `Tensor` or `SparseTensor` conataining the same values as `inputs`, but
     with innermost dimensions flattened to obtain rank `new_rank`.
 
   Raises:
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index b01fd5d5c9..997f910a2a 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -310,17 +310,6 @@ class BiasAddTest(test.TestCase):
 
 class ConvolutionTest(test.TestCase):
 
-  def testInvalidShape(self):
-    with self.test_session():
-      images_2d = random_ops.random_uniform((5, 7, 9, 3), seed=1)
-      with self.assertRaisesRegexp(
-          ValueError, 'Convolution expects input with rank 5, got 4'):
-        layers_lib.convolution3d(images_2d, 32, 3)
-      images_3d = random_ops.random_uniform((5, 6, 7, 9, 3), seed=1)
-      with self.assertRaisesRegexp(
-          ValueError, 'Convolution expects input with rank 4, got 5'):
-        layers_lib.convolution2d(images_3d, 32, 3)
-
   def testInvalidDataFormat(self):
     height, width = 7, 9
     with self.test_session():
@@ -3166,7 +3155,7 @@ class RepeatTests(test.TestCase):
     with self.test_session():
       images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
       output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
-      self.assertEqual(output.op.name, 'Repeat/convolution2d_3/Relu')
+      self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])
 
   def testRepeatWithScope(self):
@@ -3760,7 +3749,7 @@ class StackTests(test.TestCase):
           layers_lib.convolution2d, [10, 20, 30],
           kernel_size=[3, 3],
           padding='SAME')
-      self.assertEqual(output.op.name, 'Stack/convolution2d_3/Relu')
+      self.assertEqual(output.op.name, 'Stack/convolution_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 30])
 
   def testStackWithScope(self):
-- 
GitLab


From 7d1fe156d79cad6818a443d3e9473dd6abd4ab56 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Tue, 24 Apr 2018 14:26:21 -0700
Subject: [PATCH 380/450] shape_tuple in array_ops.stack

PiperOrigin-RevId: 194145557
---
 tensorflow/python/ops/array_ops.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index ceeabe090d..aba8beb3f4 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -935,9 +935,9 @@ def stack(values, axis=0, name="stack"):
     except (TypeError, ValueError):
       pass  # Input list contains non-constant tensors
 
-  value_shape = ops.convert_to_tensor(values[0], name=name).get_shape()
-  if value_shape.ndims is not None:
-    expanded_num_dims = value_shape.ndims + 1
+  value_shape = ops.convert_to_tensor(values[0], name=name)._shape_tuple()  # pylint: disable=protected-access
+  if value_shape is not None:
+    expanded_num_dims = len(value_shape) + 1
     if axis < -expanded_num_dims or axis >= expanded_num_dims:
       raise ValueError("axis = %d not in [%d, %d)" % (axis, -expanded_num_dims,
                                                       expanded_num_dims))
-- 
GitLab


From 1c9493f1b6aa56653b018ecf25af7040317fbb1b Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Tue, 24 Apr 2018 14:32:39 -0700
Subject: [PATCH 381/450] Run shape inference directly on the graphdef instead
 of building an intermediate graph.

PiperOrigin-RevId: 194146713
---
 tensorflow/core/grappler/costs/BUILD          |   2 +
 .../core/grappler/costs/graph_properties.cc   | 554 +++++++++---------
 .../core/grappler/costs/graph_properties.h    |  26 +-
 .../grappler/costs/graph_properties_test.cc   |   6 +
 tensorflow/core/grappler/graph_view.cc        |  49 ++
 tensorflow/core/grappler/graph_view.h         |  36 +-
 6 files changed, 373 insertions(+), 300 deletions(-)

diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index ddbf7f3697..35f11eac29 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -42,6 +42,8 @@ cc_library(
     deps = [
         ":utils",
         "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index ca30ad83a0..e3c6c40306 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -19,10 +19,13 @@ limitations under the License.
 #include <unordered_map>
 #include <unordered_set>
 #include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -253,16 +256,16 @@ typename DisjointSet<Handle>::Rep* DisjointSet<Handle>::Find(Handle value) {
   return root;
 }
 
-bool IsQueue(const Node& node) {
-  return str_util::EndsWith(node.type_string(), "QueueV2");
+bool IsQueue(const NodeDef& node) {
+  return str_util::EndsWith(node.op(), "QueueV2");
 }
 
 // Returns true if the node is an Enter op AND its input is a Queue.
-bool IsEnterWithQueue(const Node& node) {
-  if (node.IsEnter()) {
-    const Node* in_node;
-    TF_CHECK_OK(node.input_node(0, &in_node));
-    return IsQueue(*in_node);
+bool IsEnterWithQueue(const NodeDef& node, const GraphView& graph) {
+  if (IsEnter(node)) {
+    GraphView::InputPort input(&node, 0);
+    GraphView::OutputPort fanin = graph.GetRegularFanin(input);
+    return IsQueue(*fanin.node);
   }
   return false;
 }
@@ -279,8 +282,9 @@ bool HasAnyUnknownDimensions(const TensorShapeProto& proto) {
   return false;
 }
 
+// This really should be done in an external debugging tool
 void VerboseLogUnknownDimensionSources(
-    const Graph& graph,
+    const GraphDef& graph,
     const std::map<string, std::vector<OpInfo::TensorProperties>>&
         input_properties_map,
     const std::map<string, std::vector<OpInfo::TensorProperties>>&
@@ -295,17 +299,13 @@ void VerboseLogUnknownDimensionSources(
   // do not have any unknown dimensions in their inputs, but
   // we have some unknown dimensions in their outputs.
   std::map<string, int> op_to_count;
-  for (const Node* const node : graph.nodes()) {
-    if (node->num_outputs() == 0) {
-      continue;
-    }
-
-    const auto& input_properties = input_properties_map.at(node->name());
-    const auto& output_properties = output_properties_map.at(node->name());
+  for (const NodeDef& node : graph.node()) {
+    const auto& input_properties = input_properties_map.at(node.name());
+    const auto& output_properties = output_properties_map.at(node.name());
 
     bool has_unknown_inputs = false;
-    for (int i = 0; i < node->num_inputs(); ++i) {
-      if (HasAnyUnknownDimensions(input_properties[i].shape())) {
+    for (const auto& input_prop : input_properties) {
+      if (HasAnyUnknownDimensions(input_prop.shape())) {
         has_unknown_inputs = true;
         break;
       }
@@ -315,26 +315,24 @@ void VerboseLogUnknownDimensionSources(
       continue;
     }
 
-    for (int i = 0; i < node->num_outputs(); ++i) {
-      if (HasAnyUnknownDimensions(output_properties[i].shape())) {
+    for (const auto& output_prop : output_properties) {
+      if (HasAnyUnknownDimensions(output_prop.shape())) {
         string inputs = "input_shapes=[";
-        for (int i = 0; i < node->num_inputs(); ++i) {
-          inputs +=
-              PartialTensorShape::DebugString(input_properties[i].shape());
+        for (const auto& input_prop : input_properties) {
+          inputs += PartialTensorShape::DebugString(input_prop.shape());
         }
         inputs += "]";
 
         string outputs = "output_shapes=[";
-        for (int i = 0; i < node->num_outputs(); ++i) {
-          outputs +=
-              PartialTensorShape::DebugString(output_properties[i].shape());
+        for (const auto& output_prop : output_properties) {
+          outputs += PartialTensorShape::DebugString(output_prop.shape());
         }
         outputs += "]";
 
-        VLOG(2) << "Node: " << node->name() << ", Op: " << node->def().op()
-                << ", " << inputs << ", " << outputs;
+        VLOG(2) << "Node: " << node.name() << ", Op: " << node.op() << ", "
+                << inputs << ", " << outputs;
 
-        op_to_count[node->def().op()]++;
+        op_to_count[node.op()]++;
 
         // don't log again for this node
         break;
@@ -357,13 +355,13 @@ void VerboseLogUnknownDimensionSources(
 // information is refined.
 class TopoQueue {
  public:
-  explicit TopoQueue(const std::unordered_map<const Node*, int>& topo_order)
+  explicit TopoQueue(const std::unordered_map<const NodeDef*, int>& topo_order)
       : queue_(CompareNodes(topo_order)) {}
-  void push(const Node* n) { queue_.insert(n); }
-  const Node* pop() {
+  void push(const NodeDef* n) { queue_.insert(n); }
+  const NodeDef* pop() {
     CHECK(!empty());
     auto it = queue_.begin();
-    const Node* n = *it;
+    const NodeDef* n = *it;
     queue_.erase(it);
     return n;
   }
@@ -376,16 +374,16 @@ class TopoQueue {
   // use their id to ensure they're sorted topologically.
   struct CompareNodes {
     explicit CompareNodes(
-        const std::unordered_map<const Node*, int>& topo_ordering)
+        const std::unordered_map<const NodeDef*, int>& topo_ordering)
         : topo_order(topo_ordering) {}
-    bool operator()(const Node* lhs, const Node* rhs) const {
+    bool operator()(const NodeDef* lhs, const NodeDef* rhs) const {
       return topo_order.at(lhs) < topo_order.at(rhs);
     }
 
    private:
-    const std::unordered_map<const Node*, int>& topo_order;
+    const std::unordered_map<const NodeDef*, int>& topo_order;
   };
-  std::set<const Node*, CompareNodes> queue_;
+  std::set<const NodeDef*, CompareNodes> queue_;
 };
 
 // Merge and relax symbolic shapes.
@@ -396,22 +394,41 @@ class TopoQueue {
 class SymbolicShapeRefiner {
  public:
   explicit SymbolicShapeRefiner(
-      const GraphDef& graph,
+      const GraphView& graph,
       const std::unordered_map<string, std::unordered_set<int>>& fed_ports)
-      : function_library_(OpRegistry::Global(), graph.library()),
+      : graph_(graph),
+        function_library_(OpRegistry::Global(), graph.GetGraph()->library()),
         fed_ports_(fed_ports) {
-    graph_def_version_ = graph.versions().producer();
-    node_to_context_.reserve(graph.node_size());
+    graph_def_version_ = graph.GetGraph()->versions().producer();
+    node_to_context_.reserve(graph.GetGraph()->node_size());
+  }
+
+  const GraphView& graph() const { return graph_; }
+
+  struct NodeContext {
+    const OpRegistrationData* op_data;
+    DataTypeVector input_types;
+    DataTypeVector output_types;
+    std::unique_ptr<InferenceContext> inference_context;
+    std::vector<ShapeHandle> output_tensors_as_shapes;
+  };
+
+  NodeContext* GetNodeContext(const NodeDef* node) {
+    auto it = node_to_context_.find(node);
+    if (it == node_to_context_.end()) {
+      return nullptr;
+    }
+    return &it->second;
   }
 
-  InferenceContext* GetContext(const Node* node) {
+  InferenceContext* GetContext(const NodeDef* node) {
     auto it = node_to_context_.find(node);
     if (it == node_to_context_.end()) {
       return nullptr;
     }
     return it->second.inference_context.get();
   }
-  Status UpdateNode(const Node* node, bool relax, bool* refined) {
+  Status UpdateNode(const NodeDef* node, bool relax, bool* refined) {
     NodeContext* node_context = GetNodeContext(node);
     if (node_context == nullptr) {
       TF_RETURN_IF_ERROR(AddNode(node));
@@ -421,82 +438,84 @@ class SymbolicShapeRefiner {
     // Check if the shapes of the nodes in the fan-in of this node have changed,
     // and if they have, update the node input shapes.
     InferenceContext* inference_context = node_context->inference_context.get();
-    std::vector<Tensor> const_values(node->num_inputs());
-    std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
-    std::vector<ShapeHandle> input_tensors_as_shapes(node->num_inputs());
-
-    for (const Edge* e : node->in_edges()) {
-      if (e->IsControlEdge()) continue;
-
-      int dst_input = e->dst_input();
-      int src_output = e->src_output();
-
-      Node* input = e->src();
-      NodeContext* c = GetNodeContext(input);
-      if (c == nullptr) {
-        return errors::FailedPrecondition(
-            "Input ", dst_input, " ('", input->name(), "') for '", node->name(),
-            "' was not previously added to ShapeRefiner.");
-      }
+    std::vector<Tensor> const_values(inference_context->num_inputs());
+    std::vector<const Tensor*> input_tensors(inference_context->num_inputs(),
+                                             nullptr);
+    std::vector<ShapeHandle> input_tensors_as_shapes(
+        inference_context->num_inputs());
+
+    for (int dst_input = 0; dst_input < inference_context->num_inputs();
+         ++dst_input) {
+      GraphView::InputPort port(node, dst_input);
+      for (const GraphView::OutputPort fanin : graph_.GetFanin(port)) {
+        int src_output = fanin.port_id;
+        const NodeDef* input = fanin.node;
+        NodeContext* c = GetNodeContext(input);
+        if (c == nullptr) {
+          return errors::FailedPrecondition(
+              "Input ", dst_input, " ('", input->name(), "') for '",
+              node->name(), "' was not previously added to ShapeRefiner.");
+        }
 
-      if (input->IsConstant()) {
-        // Convert constant value into tensors.
-        if (const_values[dst_input].FromProto(
-                input->def().attr().at("value").tensor())) {
-          input_tensors[dst_input] = &const_values[dst_input];
-          // Integer tensors of rank one can also be interpreted as a shape
-          // provided all their values are >= -1.
-          if (const_values[dst_input].dims() == 1 &&
-              (const_values[dst_input].dtype() == DT_INT32 ||
-               const_values[dst_input].dtype() == DT_INT64)) {
-            ShapeHandle tensor_shape = inference_context->Vector(
-                const_values[dst_input].NumElements());
-            ShapeHandle shp;
-            if (inference_context
-                    ->MakeShapeFromTensor(input_tensors[dst_input],
-                                          tensor_shape, &shp)
-                    .ok()) {
-              input_tensors_as_shapes[dst_input] = shp;
+        if (IsConstant(*input)) {
+          // Convert constant value into tensors.
+          if (const_values[dst_input].FromProto(
+                  input->attr().at("value").tensor())) {
+            input_tensors[dst_input] = &const_values[dst_input];
+            // Integer tensors of rank one can also be interpreted as a shape
+            // provided all their values are >= -1.
+            if (const_values[dst_input].dims() == 1 &&
+                (const_values[dst_input].dtype() == DT_INT32 ||
+                 const_values[dst_input].dtype() == DT_INT64)) {
+              ShapeHandle tensor_shape = inference_context->Vector(
+                  const_values[dst_input].NumElements());
+              ShapeHandle shp;
+              if (inference_context
+                      ->MakeShapeFromTensor(input_tensors[dst_input],
+                                            tensor_shape, &shp)
+                      .ok()) {
+                input_tensors_as_shapes[dst_input] = shp;
+              }
             }
           }
         }
-      }
 
-      if (c->output_tensors_as_shapes.size() > src_output) {
-        input_tensors_as_shapes[dst_input] =
-            c->output_tensors_as_shapes[src_output];
-      }
-
-      DCHECK_GE(dst_input, 0);
-      if (!*refined && !inference_context->input(dst_input).SameHandle(
-                           c->inference_context->output(src_output))) {
-        *refined = true;
-      }
-      inference_context->SetInput(dst_input,
-                                  c->inference_context->output(src_output));
-
-      if (!*refined &&
-          inference_context->requested_input_tensor_as_partial_shape(
-              dst_input)) {
-        // The input value may have changed. Since we have no way to know if
-        // that's indeed the case, err on the safe side.
-        *refined = true;
-      }
-
-      // Also propagate handle shape and dtype of edges which are carrying
-      // resource handles.
-      if (e->src()->output_type(src_output) == DT_RESOURCE) {
-        auto* outputs =
-            c->inference_context->output_handle_shapes_and_types(src_output);
-        if (!outputs) continue;
-        auto* inputs =
-            inference_context->input_handle_shapes_and_types(dst_input);
+        if (c->output_tensors_as_shapes.size() > src_output) {
+          input_tensors_as_shapes[dst_input] =
+              c->output_tensors_as_shapes[src_output];
+        }
 
-        if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs)) {
+        DCHECK_GE(dst_input, 0);
+        if (!*refined && !inference_context->input(dst_input).SameHandle(
+                             c->inference_context->output(src_output))) {
+          *refined = true;
+        }
+        inference_context->SetInput(dst_input,
+                                    c->inference_context->output(src_output));
+
+        if (!*refined &&
+            inference_context->requested_input_tensor_as_partial_shape(
+                dst_input)) {
+          // The input value may have changed. Since we have no way to know if
+          // that's indeed the case, err on the safe side.
           *refined = true;
         }
-        inference_context->set_input_handle_shapes_and_types(dst_input,
-                                                             *outputs);
+
+        // Also propagate handle shape and dtype of edges which are carrying
+        // resource handles.
+        if (node_context->input_types[dst_input] == DT_RESOURCE) {
+          auto* outputs =
+              c->inference_context->output_handle_shapes_and_types(src_output);
+          if (!outputs) continue;
+          auto* inputs =
+              inference_context->input_handle_shapes_and_types(dst_input);
+
+          if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs)) {
+            *refined = true;
+          }
+          inference_context->set_input_handle_shapes_and_types(dst_input,
+                                                               *outputs);
+        }
       }
     }
 
@@ -510,10 +529,10 @@ class SymbolicShapeRefiner {
         input_tensors_as_shapes);
 
     // Update the shapes of the outputs.
-    return InferShapes(node, node_context);
+    return InferShapes(*node, node_context);
   }
 
-  Status SetUnknownShape(const Node* node, int output_port) {
+  Status SetUnknownShape(const NodeDef* node, int output_port) {
     shape_inference::ShapeHandle shape =
         GetUnknownOutputShape(node, output_port);
     InferenceContext* ctx = GetContext(node);
@@ -525,7 +544,7 @@ class SymbolicShapeRefiner {
   }
 
   struct ShapeId {
-    const Node* node;
+    const NodeDef* node;
     int port_id;
     bool operator==(const ShapeId& other) const {
       return node == other.node && port_id == other.port_id;
@@ -533,12 +552,12 @@ class SymbolicShapeRefiner {
   };
   struct HashShapeId {
     std::size_t operator()(const ShapeId& shp) const {
-      return std::hash<const Node*>{}(shp.node) + shp.port_id;
+      return std::hash<const NodeDef*>{}(shp.node) + shp.port_id;
     }
   };
 
   struct DimId {
-    const Node* node;
+    const NodeDef* node;
     int port_id;
     int dim_index;
     bool operator==(const DimId& other) const {
@@ -549,13 +568,14 @@ class SymbolicShapeRefiner {
 
   struct HashDimId {
     std::size_t operator()(const DimId& dim) const {
-      return std::hash<const Node*>{}(dim.node) + dim.port_id + dim.dim_index;
+      return std::hash<const NodeDef*>{}(dim.node) + dim.port_id +
+             dim.dim_index;
     }
   };
 
   // Compute the shape of the tensors outputed by node 'node' at output port
   // 'port_index' as the intersection of shape1 and shape2.
-  ShapeHandle OutputAsIntersection(const Node* node, int port_index,
+  ShapeHandle OutputAsIntersection(const NodeDef* node, int port_index,
                                    ShapeHandle shape1, ShapeHandle shape2) {
     if (shape1.SameHandle(shape2)) {
       return shape1;
@@ -600,7 +620,7 @@ class SymbolicShapeRefiner {
 
   // Compute the shape of the tensors outputed by node 'node' at output port
   // 'port_index' as the union of shape1 and shape2.
-  ShapeHandle OutputAsUnion(const Node* node, int port_index,
+  ShapeHandle OutputAsUnion(const NodeDef* node, int port_index,
                             ShapeHandle shape1, ShapeHandle shape2) {
     if (shape1.SameHandle(shape2)) {
       return shape1;
@@ -670,20 +690,24 @@ class SymbolicShapeRefiner {
     return true;
   }
 
-  Status AddNode(const Node* node) {
+  Status AddNode(const NodeDef* node) {
+    NodeContext& node_ctx = node_to_context_[node];
+    TF_RETURN_IF_ERROR(function_library_.LookUp(node->op(), &node_ctx.op_data));
+
+    TF_RETURN_IF_ERROR(InOutTypesForNode(*node, node_ctx.op_data->op_def,
+                                         &node_ctx.input_types,
+                                         &node_ctx.output_types));
+
     // Create the inference context for this node.
-    std::vector<ShapeHandle> input_shapes(node->num_inputs());
+    const int num_inputs = node_ctx.input_types.size();
+    std::vector<ShapeHandle> input_shapes(num_inputs);
     std::vector<std::unique_ptr<std::vector<ShapeAndType>>>
-        input_handle_shapes_and_types(node->num_inputs());
-    std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
+        input_handle_shapes_and_types(num_inputs);
+    std::vector<const Tensor*> input_tensors(num_inputs, nullptr);
     std::vector<ShapeHandle> input_tensors_as_shapes;
 
-    NodeContext& node_ctx = node_to_context_[node];
-    TF_RETURN_IF_ERROR(
-        function_library_.LookUp(node->type_string(), &node_ctx.op_data));
-
     node_ctx.inference_context.reset(new InferenceContext(
-        graph_def_version_, &node->def(), node->op_def(), input_shapes,
+        graph_def_version_, node, node_ctx.op_data->op_def, input_shapes,
         input_tensors, input_tensors_as_shapes,
         std::move(input_handle_shapes_and_types)));
     const Status s = node_ctx.inference_context->construction_status();
@@ -696,7 +720,7 @@ class SymbolicShapeRefiner {
  private:
   // Return the one ShapeHandle used to denote a fully unknown shape for a node
   // output.
-  ShapeHandle GetUnknownOutputShape(const Node* node, int index) {
+  ShapeHandle GetUnknownOutputShape(const NodeDef* node, int index) {
     ShapeId id{node, index};
     auto it = unknown_shapes_.find(id);
     if (it != unknown_shapes_.end()) {
@@ -709,7 +733,8 @@ class SymbolicShapeRefiner {
   }
   // Return the one ShapeHandle used to denote a fully unknown dimension for a
   // node output.
-  DimensionHandle GetUnknownOutputDim(const Node* node, int index, int dim_id) {
+  DimensionHandle GetUnknownOutputDim(const NodeDef* node, int index,
+                                      int dim_id) {
     DimId id{node, index, dim_id};
     auto it = unknown_dims_.find(id);
     if (it != unknown_dims_.end()) {
@@ -721,31 +746,25 @@ class SymbolicShapeRefiner {
     return dim;
   }
 
-  struct NodeContext {
-    const OpRegistrationData* op_data;
-    std::unique_ptr<InferenceContext> inference_context;
-    std::vector<ShapeHandle> output_tensors_as_shapes;
-  };
-
-  Status InferShapes(const Node* node, NodeContext* c) {
+  Status InferShapes(const NodeDef& node, NodeContext* c) {
     InferenceContext* ic = c->inference_context.get();
 
-    auto it = fed_ports_.find(node->name());
+    auto it = fed_ports_.find(node.name());
     const bool is_fed = it != fed_ports_.end();
 
     // Propagate shape tensors unless the node is fed.
     // TODO(bsteiner) We should still propagate the shapes to the ports that
     // aren't fed in the case of a ShapeN node.
     if (!is_fed) {
-      if (node->type_string() == "Shape") {
+      if (IsShape(node)) {
         c->output_tensors_as_shapes.resize(1);
         c->output_tensors_as_shapes[0] = c->inference_context->input(0);
-      } else if (node->type_string() == "ShapeN") {
+      } else if (IsShapeN(node)) {
         c->output_tensors_as_shapes.resize(c->inference_context->num_inputs());
         for (int i = 0; i < c->inference_context->num_inputs(); ++i) {
           c->output_tensors_as_shapes[i] = c->inference_context->input(i);
         }
-      } else if (node->type_string() == "ConcatV2") {
+      } else if (node.op() == "ConcatV2") {
         bool valid = true;
         ShapeHandle result;
         for (int i = 0; i < ic->num_inputs() - 1; ++i) {
@@ -763,7 +782,7 @@ class SymbolicShapeRefiner {
           c->output_tensors_as_shapes.resize(1);
           c->output_tensors_as_shapes[0] = result;
         }
-      } else if (node->type_string() == "Slice") {
+      } else if (IsSlice(node)) {
         ShapeHandle input = ic->input_tensors_as_shapes()[0];
         bool valid = ic->RankKnown(input);
         const Tensor* slice_offset = ic->input_tensor(1);
@@ -800,22 +819,16 @@ class SymbolicShapeRefiner {
       // It is possible to feed node output ports with tensors of any shape: as
       // a result, the shape of a fed port is completely unknown.
       for (const int output_port : it->second) {
-        status.Update(SetUnknownShape(node, output_port));
+        status.Update(SetUnknownShape(&node, output_port));
       }
     }
     return status;
   }
 
-  NodeContext* GetNodeContext(const Node* node) {
-    auto it = node_to_context_.find(node);
-    if (it == node_to_context_.end()) {
-      return nullptr;
-    }
-    return &it->second;
-  }
-
+ private:
+  const GraphView& graph_;
   int graph_def_version_;
-  std::unordered_map<const Node*, NodeContext> node_to_context_;
+  std::unordered_map<const NodeDef*, NodeContext> node_to_context_;
   std::unordered_map<ShapeId, ShapeHandle, HashShapeId> unknown_shapes_;
   std::unordered_map<DimId, DimensionHandle, HashDimId> unknown_dims_;
   FunctionLibraryDefinition function_library_;
@@ -874,7 +887,7 @@ class SymbolicShapeManager {
 };
 
 Status GraphProperties::MergeEnqueueShapesAndTypes(
-    SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+    SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode,
     const std::vector<ShapeAndType>& shapes_and_types,
     std::vector<ShapeAndType>* queue_shapes_and_types) {
   if (shapes_and_types.size() != queue_shapes_and_types->size()) {
@@ -897,7 +910,7 @@ Status GraphProperties::MergeEnqueueShapesAndTypes(
 }
 
 Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
-    SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+    SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode,
     const std::vector<ShapeAndType>& shapes_and_types,
     std::vector<ShapeAndType>* queue_shapes_and_types) {
   if (shapes_and_types.size() != queue_shapes_and_types->size()) {
@@ -925,7 +938,7 @@ Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
 // inputs are UnknownShapes. So we need to ignore the input from NextIteration
 // nodes to propagate any known shape from the Merge node.
 Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
-                                        const Node* node, bool relax,
+                                        const NodeDef* node, bool relax,
                                         bool* new_shapes) const {
   InferenceContext* c = shape_refiner->GetContext(node);
   if (!c) {
@@ -942,25 +955,24 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
 
   ShapeHandle out;
   bool out_initialized = false;
-  for (const Edge* e : node->in_edges()) {
-    if (e->IsControlEdge()) {
-      continue;
-    }
+  for (const GraphView::Edge fanin :
+       shape_refiner->graph().GetFaninEdges(*node, false)) {
     // Skip back edges during the initial propagation phase. This is equivalent
     // to assuming that all the inputs to the merge nodes are fed by the same
     // shape, and will be corrected as needed in the relaxation phase.
-    if (!relax && e->src()->IsNextIteration()) {
+    if (!relax && IsNextIteration(*fanin.src.node)) {
       continue;
     }
 
-    InferenceContext* in = shape_refiner->GetContext(e->src());
+    InferenceContext* in = shape_refiner->GetContext(fanin.src.node);
     if (!relax && !in) {
       // Handling a loop for the first time, the back edge won't have any shape
       // info.
       continue;
     }
-    ShapeHandle input = in->output(e->src_output());
-    c->SetInput(e->dst_input(), input);
+    ShapeHandle input = in->output(fanin.src.port_id);
+    CHECK_EQ(fanin.tgt.node, node);
+    c->SetInput(fanin.tgt.port_id, input);
     if (!out_initialized) {
       out_initialized = true;
       out = input;
@@ -984,7 +996,7 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
 // Manually propagate the input shape for Enter nodes and update any Merge node
 // outputs.
 Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
-                                    const Node* node, bool relax,
+                                    const NodeDef* node, bool relax,
                                     bool* new_shapes) {
   auto enter_ctx = shape_refiner->GetContext(node);
   if (!enter_ctx) {
@@ -992,33 +1004,27 @@ Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
     enter_ctx = shape_refiner->GetContext(node);
   }
 
-  for (const Edge* e : node->in_edges()) {
-    if (e->IsControlEdge()) {
-      continue;
-    }
-    InferenceContext* in = shape_refiner->GetContext(e->src());
-    ShapeHandle input = in->output(e->src_output());
-    if (!enter_ctx->output(0).SameHandle(input)) {
-      if (relax) {
-        enter_ctx->RelaxInput(0, input);
-      } else {
-        enter_ctx->MergeInput(0, input);
-      }
-      enter_ctx->set_output(0, input);
-      *new_shapes = true;
-    }
+  GraphView::InputPort inp(node, 0);
+  GraphView::OutputPort fanin = shape_refiner->graph().GetRegularFanin(inp);
+
+  InferenceContext* in = shape_refiner->GetContext(fanin.node);
+  ShapeHandle input = in->output(fanin.port_id);
+  if (!enter_ctx->output(0).SameHandle(input)) {
+    enter_ctx->SetInput(0, input);
+    enter_ctx->set_output(0, input);
+    *new_shapes = true;
   }
   return Status::OK();
 }
 
-Status GraphProperties::UpdateShapes(
-    SymbolicShapeRefiner* shape_refiner, bool relax,
-    const Node* n, bool* new_shapes) const {
-  if (n->IsEnter()) {
+Status GraphProperties::UpdateShapes(SymbolicShapeRefiner* shape_refiner,
+                                     bool relax, const NodeDef* n,
+                                     bool* new_shapes) const {
+  if (IsEnter(*n)) {
     // The Enter shape function always forwards an UnknownShape, so do the right
     // thing here.
     TF_RETURN_IF_ERROR(UpdateEnter(shape_refiner, n, relax, new_shapes));
-  } else if (n->IsMerge()) {
+  } else if (IsMerge(*n)) {
     // Properly handle merge nodes.
     TF_RETURN_IF_ERROR(UpdateMergeNode(shape_refiner, n, relax, new_shapes));
   } else {
@@ -1028,7 +1034,7 @@ Status GraphProperties::UpdateShapes(
     if (updated) {
       // We want to avoid propagating through loops on the merge pass because
       // the shapes are not guaranteed to converge.
-      if (relax || !n->IsNextIteration()) {
+      if (relax || !IsNextIteration(*n)) {
         *new_shapes = true;
       }
     }
@@ -1039,8 +1045,8 @@ Status GraphProperties::UpdateShapes(
 // Propagates the shapes in the transitive fan-out of <new_shapes>.
 Status GraphProperties::PropagateShapes(
     SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
-    const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
-        resources,
+    const std::unordered_map<const NodeDef*,
+                             std::unordered_set<const NodeDef*>>& resources,
     int num_loops) const {
   // Limit the number of iterations to prevent infinite loops in the presence of
   // incorrect shape functions. The algoritm should converge in at most
@@ -1062,15 +1068,13 @@ Status GraphProperties::PropagateShapes(
     int64 num_loop_iterations = 0;
     while (!new_shapes->empty() &&
            num_loop_iterations++ < max_loop_iterations) {
-      const Node* n = new_shapes->pop();
+      const NodeDef* n = new_shapes->pop();
       bool updated = false;
       TF_RETURN_IF_ERROR(UpdateShapes(shape_refiner, relax, n, &updated));
       if (updated) {
-        for (const Edge* e : n->out_edges()) {
-          if (!e->IsControlEdge()) {
-            const Node* fanout = e->dst();
-            new_shapes->push(fanout);
-          }
+        for (const GraphView::InputPort fanout :
+             shape_refiner->graph().GetFanouts(*n, false)) {
+          new_shapes->push(fanout.node);
         }
       }
     }
@@ -1093,10 +1097,11 @@ Status GraphProperties::PropagateShapes(
 }
 
 Status GraphProperties::UpdateResource(
-    const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
+    const NodeDef* qnode,
+    const std::unordered_set<const NodeDef*>& queue_inputs,
     SymbolicShapeRefiner* shape_refiner, TopoQueue* new_shapes) {
   // Proceed only if qnode is a queue or an Enter with queue input.
-  if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode)) {
+  if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode, shape_refiner->graph())) {
     return Status::OK();
   }
   auto qctx = shape_refiner->GetContext(qnode);
@@ -1109,16 +1114,17 @@ Status GraphProperties::UpdateResource(
   // are in.
   std::vector<ShapeAndType> queue_shapes_and_types;
   for (const auto& node : queue_inputs) {
-    auto ctx = shape_refiner->GetContext(node);
+    auto ctx = shape_refiner->GetNodeContext(node);
     if (!ctx) {
       continue;
     }
     // TODO(bsteiner): handle EnqueueMany as well.
-    if (node->type_string().find("Enqueue") != std::string::npos &&
-        node->type_string().find("EnqueueMany") == std::string::npos) {
+    if (node->op().find("Enqueue") != std::string::npos &&
+        node->op().find("EnqueueMany") == std::string::npos) {
       std::vector<ShapeAndType> shapes_and_types;
-      for (int i = 1; i < ctx->num_inputs(); ++i) {
-        shapes_and_types.push_back({ctx->input(i), node->input_type(i)});
+      for (int i = 1; i < ctx->input_types.size(); ++i) {
+        shapes_and_types.push_back(
+            {ctx->inference_context->input(i), ctx->input_types[i]});
       }
       if (queue_shapes_and_types.empty()) {
         queue_shapes_and_types = shapes_and_types;
@@ -1134,11 +1140,9 @@ Status GraphProperties::UpdateResource(
                                                queue_shapes_and_types)) {
     qctx->set_output_handle_shapes_and_types(0, queue_shapes_and_types);
 
-    for (const Edge* e : qnode->out_edges()) {
-      if (!e->IsControlEdge()) {
-        const Node* fanout = e->dst();
-        new_shapes->push(fanout);
-      }
+    for (const GraphView::InputPort fanout :
+         shape_refiner->graph().GetFanouts(*qnode, false)) {
+      new_shapes->push(fanout.node);
     }
   }
 
@@ -1148,18 +1152,6 @@ Status GraphProperties::UpdateResource(
 Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item_.graph.library());
-  Graph graph(function_library);
-  graph_ = &graph;
-  ImportGraphDefOptions options;
-  // Graph optimization happens at the late stage of graph execution,
-  // when colocation constraints are already validated previously and
-  // the device placement of nodes has also completed, so there
-  // is no need to validate colocation constraints again.
-  options.validate_colocation_constraints = false;
-  options.validate_shape = false;
-  Status s = ImportGraphDef(options, item_.graph, &graph, nullptr);
-  TF_RETURN_IF_ERROR(s);
-
   std::unordered_map<string, std::unordered_set<int>> fed_ports;
   if (!assume_valid_feeds) {
     for (const auto& feed : item_.feed) {
@@ -1172,46 +1164,45 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   std::unordered_map<const NodeDef*, int> topo_order;
   TF_RETURN_IF_ERROR(ComputeTopologicalOrder(item_.graph, &topo_order));
 
-  std::unordered_map<string, int> order_by_name;
-  for (const auto topo : topo_order) {
-    order_by_name[topo.first->name()] = topo.second;
-  }
+  GraphView graph_view(&item_.graph);
 
-  // List the resources and the nodes using them. Also collect the Enter and
-  // Merge nodes.
-  std::unordered_map<const Node*, int> graph_topo_order;
-  std::unordered_map<const Node*, std::unordered_set<const Node*>> resources;
-  std::unordered_set<const Node*> merge_nodes;
-  std::unordered_set<const Node*> fed_nodes;
-  std::unordered_set<const Node*> primary_inputs;
+  // List the resources and the nodes using them. Also collect the Merge nodes,
+  // fed nodes, and primary inputs.
+  std::unordered_map<const NodeDef*, std::unordered_set<const NodeDef*>>
+      resources;
+  std::unordered_set<const NodeDef*> merge_nodes;
+  std::unordered_set<const NodeDef*> fed_nodes;
+  std::unordered_set<const NodeDef*> primary_inputs;
   int num_loops = 0;
-  for (const Node* const node : graph.nodes()) {
-    auto it = order_by_name.find(node->name());
-    if (it == order_by_name.end()) {
-      continue;
-    }
-    graph_topo_order[node] = it->second;
-
-    for (int i = 0; i < node->num_inputs(); ++i) {
-      if (node->input_type(i) == DataType::DT_RESOURCE) {
-        const Node* resource;
-        TF_CHECK_OK(node->input_node(i, &resource));
-        resources[resource].insert(node);
-      }
-    }
-    if (node->num_inputs() == 0) {
-      primary_inputs.insert(node);
-    } else if (node->IsMerge()) {
-      merge_nodes.insert(node);
-    } else if (node->IsNextIteration()) {
+  for (const NodeDef& node : item_.graph.node()) {
+    if (NumNonControlInputs(node) == 0) {
+      primary_inputs.insert(&node);
+    } else if (IsMerge(node)) {
+      merge_nodes.insert(&node);
+    } else if (IsNextIteration(node)) {
       ++num_loops;
+    } else {
+      const OpRegistrationData* op_data;
+      TF_RETURN_IF_ERROR(function_library.LookUp(node.op(), &op_data));
+      DataTypeVector input_types;
+      DataTypeVector output_types;
+      TF_RETURN_IF_ERROR(InOutTypesForNode(node, op_data->op_def, &input_types,
+                                           &output_types));
+      for (int i = 0; i < input_types.size(); ++i) {
+        if (input_types[i] == DataType::DT_RESOURCE) {
+          GraphView::InputPort input(&node, i);
+          const GraphView::OutputPort resource =
+              graph_view.GetRegularFanin(input);
+          resources[resource.node].insert(&node);
+        }
+      }
     }
-    if (fed_ports.find(node->name()) != fed_ports.end()) {
-      fed_nodes.insert(node);
+    if (fed_ports.find(node.name()) != fed_ports.end()) {
+      fed_nodes.insert(&node);
     }
   }
 
-  SymbolicShapeRefiner refiner(item_.graph, fed_ports);
+  SymbolicShapeRefiner refiner(graph_view, fed_ports);
 
   // We propagate shapes through the graph in two phases. In the first phase, we
   // exclusively merge shapes but we do not propagate shapes through the
@@ -1219,19 +1210,19 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   // we exclusively relax shapes and propagate shapes through loops until
   // reaching fixed point.
   for (int relax = 0; relax < 2; relax++) {
-    TopoQueue new_shapes(graph_topo_order);
+    TopoQueue new_shapes(topo_order);
     // Seed the propagation of shapes through merge nodes.
     if (relax) {
-      for (const Node* node : merge_nodes) {
+      for (const NodeDef* node : merge_nodes) {
         new_shapes.push(node);
       }
     }
     // Also seed the propagation of shapes in the fanout of primary inputs.
-    for (const Node* node : primary_inputs) {
+    for (const NodeDef* node : primary_inputs) {
       new_shapes.push(node);
     }
     // Also seed the propagation of shapes in the fanout of fed nodes.
-    for (const Node* node : fed_nodes) {
+    for (const NodeDef* node : fed_nodes) {
       new_shapes.push(node);
     }
     // Propagate shapes normally.
@@ -1242,14 +1233,14 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   // Track shapes globally across the graph.
   SymbolicShapeManager shape_manager;
   bool found_error = false;
-  for (const Node* const node : graph.nodes()) {
-    auto node_ctx = refiner.GetContext(node);
+  for (const NodeDef& node : item_.graph.node()) {
+    auto node_ctx = refiner.GetContext(&node);
     if (!node_ctx) {
       continue;
     }
     // Skip any information that comes from fed nodes.
-    if (fed_ports.find(node->name()) != fed_ports.end()) {
-      VLOG(2) << "Skipping feed node shape: " << node->name();
+    if (fed_ports.find(node.name()) != fed_ports.end()) {
+      VLOG(2) << "Skipping feed node shape: " << node.name();
       continue;
     }
     for (const auto& merged_shapes : node_ctx->MergedShapes()) {
@@ -1273,61 +1264,56 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
   }
 
-  for (const Node* const node : graph.nodes()) {
-    VLOG(3) << "Filling in graph properties for node: " << node->name();
-    auto ctx = refiner.GetContext(node);
+  for (const NodeDef& node : item_.graph.node()) {
+    VLOG(3) << "Filling in graph properties for node: " << node.name();
+    auto ctx = refiner.GetNodeContext(&node);
     if (!ctx) {
       continue;
     }
 
     // Fill input properties.
     {
-      CHECK_EQ(ctx->num_inputs(), node->num_inputs());
-      auto& input_properties = input_properties_[node->name()];
+      // CHECK_EQ(ctx->num_inputs(), node.num_inputs());
+      auto& input_properties = input_properties_[node.name()];
 
       // Should always be empty, node names in graph are supposed to be unique.
       CHECK_EQ(input_properties.size(), 0);
 
-      input_properties.resize(ctx->num_inputs());
-      for (int i = 0; i < ctx->num_inputs(); ++i) {
-        shape_manager.AsTensorProperties(ctx->input(i), node->input_type(i),
+      input_properties.resize(ctx->inference_context->num_inputs());
+      GraphView::InputPort input(&node, -1);
+      for (int i = 0; i < ctx->inference_context->num_inputs(); ++i) {
+        shape_manager.AsTensorProperties(ctx->inference_context->input(i),
+                                         ctx->input_types[i],
                                          &input_properties[i]);
-      }
-      for (const auto& edge : node->in_edges()) {
-        if (edge->IsControlEdge()) {
-          continue;
-        }
-        if (!edge->src()->IsConstant()) {
-          continue;
-        }
-        const int input_id = edge->dst_input();
-        if (input_id >= input_properties.size()) {
+        input.port_id = i;
+        GraphView::OutputPort fanin = graph_view.GetRegularFanin(input);
+        if (!IsConstant(*fanin.node)) {
           continue;
         }
-        const NodeDef& node = edge->src()->def();
-        const TensorProto& raw_val = node.attr().at("value").tensor();
-        *input_properties[input_id].mutable_value() = raw_val;
+        const TensorProto& raw_val = fanin.node->attr().at("value").tensor();
+        *input_properties[i].mutable_value() = raw_val;
       }
     }
 
     // Fill output properties.
     {
-      CHECK_EQ(ctx->num_outputs(), node->num_outputs());
-      auto& output_properties = output_properties_[node->name()];
+      // CHECK_EQ(ctx->num_outputs(), node->num_outputs());
+      auto& output_properties = output_properties_[node.name()];
 
       // Should always be empty, node names in graph are supposed to be unique.
       CHECK_EQ(output_properties.size(), 0);
 
-      output_properties.resize(ctx->num_outputs());
-      for (int i = 0; i < ctx->num_outputs(); ++i) {
-        shape_manager.AsTensorProperties(ctx->output(i), node->output_type(i),
+      output_properties.resize(ctx->inference_context->num_outputs());
+      for (int i = 0; i < ctx->inference_context->num_outputs(); ++i) {
+        shape_manager.AsTensorProperties(ctx->inference_context->output(i),
+                                         ctx->output_types[i],
                                          &output_properties[i]);
       }
     }
   }
 
   // Help trace the unknown dimensions to their origins.
-  VerboseLogUnknownDimensionSources(graph, input_properties_,
+  VerboseLogUnknownDimensionSources(item_.graph, input_properties_,
                                     output_properties_);
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index a4e3031db1..485324c466 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 
 namespace tensorflow {
-class Graph;
 
 namespace grappler {
 
@@ -79,40 +78,41 @@ class GraphProperties {
   // Merges shapes <shapes_and_types>, determined from an EnqueueV2 node, into
   // <*queue_shapes_and_types>.
   static Status MergeEnqueueShapesAndTypes(
-      SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+      SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode,
       const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
       std::vector<shape_inference::ShapeAndType>* queue_shapes_and_types);
   // Relaxes shapes <shapes_and_types>, determined from an EnqueueV2 node, into
   // <*queue_shapes_and_types>.
   static Status RelaxEnqueueShapesAndMergeTypes(
-      SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+      SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode,
       const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
       std::vector<shape_inference::ShapeAndType>* queue_shapes_and_types);
 
   // Update the shapes for qnode. If output shapes of qnode have changed,
   // enqueue its fanout in 'new_shapes'.
   static Status UpdateResource(
-      const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
+      const NodeDef* qnode,
+      const std::unordered_set<const NodeDef*>& queue_inputs,
       SymbolicShapeRefiner* shape_refiner, TopoQueue* new_shapes);
 
   // Update the output shapes of a Merge node, and enqueue its fanout in
   // new_shapes if needed.
-  Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner, const Node* node,
-                         bool relax, bool* new_shapes) const;
+  Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
+                         const NodeDef* node, bool relax,
+                         bool* new_shapes) const;
   // Process the Enter node, and enqueue its fanout in new_shapes if needed.
   static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner,
-                            const Node* node, bool relax, bool* new_shapes);
+                            const NodeDef* node, bool relax, bool* new_shapes);
   // Update the shapes for node 'n'. If output shapes for n have changed,
   // enqueue its fanout in 'new_shapes'.
-  Status UpdateShapes(
-      SymbolicShapeRefiner* shape_refiner, bool relax,
-      const Node* n, bool* new_shapes) const;
+  Status UpdateShapes(SymbolicShapeRefiner* shape_refiner, bool relax,
+                      const NodeDef* n, bool* new_shapes) const;
   // Propagate the shapes for the nodes enqueued in new_shapes and their
   // transitive fanout until a fixed point is reached.
   Status PropagateShapes(
       SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
-      const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
-          resources,
+      const std::unordered_map<const NodeDef*,
+                               std::unordered_set<const NodeDef*>>& resources,
       int num_loops) const;
 
   // Data members
@@ -120,8 +120,6 @@ class GraphProperties {
   std::map<string, std::vector<OpInfo::TensorProperties>> input_properties_;
   std::map<string, std::vector<OpInfo::TensorProperties>> output_properties_;
   const std::vector<OpInfo::TensorProperties> missing_properties_;
-
-  Graph* graph_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 3de697bd37..afe334dfa2 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
@@ -955,6 +956,11 @@ TEST_F(GraphPropertiesTest, Performance) {
   string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
                                  "large_graph.pbtxt.html");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
+  TF_CHECK_OK(AddDefaultAttrsToGraphDef(
+      &item.graph,
+      FunctionLibraryDefinition(OpRegistry::Global(), item.graph.library()), 0,
+      true));
+
   GraphProperties properties(item);
   TF_CHECK_OK(properties.InferStatically(false));
 }
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index 0d3f94854b..3e448216f9 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -173,5 +173,54 @@ int GraphView::NumFanins(const NodeDef& node,
   return count;
 }
 
+std::unordered_set<GraphView::Edge, GraphView::HashEdge>
+GraphView::GetFanoutEdges(const NodeDef& node,
+                          bool include_controlled_edges) const {
+  std::unordered_set<Edge, HashEdge> result;
+  OutputPort port;
+  port.node = const_cast<NodeDef*>(&node);
+  const int first_port_id = include_controlled_edges ? -1 : 0;
+  auto it = num_regular_outputs_.find(&node);
+  const int last_port_id = (it != num_regular_outputs_.end()) ? it->second : -1;
+
+  for (int i = first_port_id; i <= last_port_id; ++i) {
+    port.port_id = i;
+    auto it = fanouts_.find(port);
+    if (it != fanouts_.end()) {
+      Edge fanout;
+      fanout.src.node = const_cast<NodeDef*>(&node);
+      fanout.src.port_id = i;
+      for (auto itr = it->second.begin(); itr != it->second.end(); ++itr) {
+        fanout.tgt = *itr;
+        result.insert(fanout);
+      }
+    }
+  }
+  return result;
+}
+
+std::unordered_set<GraphView::Edge, GraphView::HashEdge>
+GraphView::GetFaninEdges(const NodeDef& node,
+                         bool include_controlling_edges) const {
+  std::unordered_set<Edge, HashEdge> result;
+  for (int i = 0; i < node.input_size(); ++i) {
+    Edge fanin;
+    fanin.tgt.node = const_cast<NodeDef*>(&node);
+    fanin.tgt.port_id = i;
+    string fanin_name = ParseNodeName(node.input(i), &fanin.src.port_id);
+    if (fanin.src.port_id < 0) {
+      if (!include_controlling_edges) {
+        break;
+      }
+    }
+    auto it = nodes_.find(fanin_name);
+    if (it != nodes_.end()) {
+      fanin.src.node = it->second;
+      result.insert(fanin);
+    }
+  }
+  return result;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index 173ce9c09c..c3baad0987 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -29,6 +29,8 @@ namespace grappler {
 class GraphView {
  public:
   struct Port {
+    Port() : node(nullptr), port_id(-1) {}
+    Port(NodeDef* n, int port) : node(n), port_id(port) {}
     NodeDef* node = nullptr;
     int port_id = -1;
 
@@ -36,8 +38,16 @@ class GraphView {
       return node == other.node && port_id == other.port_id;
     }
   };
-  struct InputPort : public Port {};
-  struct OutputPort : public Port {};
+  struct InputPort : public Port {
+    InputPort() = default;
+    InputPort(NodeDef* n, int port_id) : Port(n, port_id) {}
+    InputPort(const NodeDef* n, int port_id)
+        : Port(const_cast<NodeDef*>(n), port_id) {}
+  };
+  struct OutputPort : public Port {
+    OutputPort() = default;
+    OutputPort(NodeDef* n, int port_id) : Port(n, port_id) {}
+  };
 
   struct HashPort {
     std::size_t operator()(const Port& port) const {
@@ -45,6 +55,20 @@ class GraphView {
     }
   };
 
+  struct Edge {
+    OutputPort src;
+    InputPort tgt;
+
+    bool operator==(const Edge& other) const {
+      return src == other.src && tgt == other.tgt;
+    }
+  };
+  struct HashEdge {
+    std::size_t operator()(const Edge& edge) const {
+      return HashPort()(edge.src) + HashPort()(edge.tgt);
+    }
+  };
+
   explicit GraphView(GraphDef* graph);
   GraphDef* GetGraph() const { return graph_; }
   NodeDef* GetNode(const string& node_name) const;
@@ -63,6 +87,7 @@ class GraphView {
       const OutputPort& port) const;
   std::unordered_set<OutputPort, HashPort> GetFanin(
       const InputPort& port) const;
+
   // Special case: regular (i.e. non-control) input ports can only have one
   // fanin.
   const OutputPort GetRegularFanin(const InputPort& port) const;
@@ -79,6 +104,13 @@ class GraphView {
   // controlling nodes iff include_controlling_nodes is true.
   int NumFanins(const NodeDef& node, bool include_controlling_nodes) const;
 
+  // Get all the edges in the immediate fanout (resp. fanin) of a node. Include
+  // the control edges iff the corresponding include_*_edges argument is true.
+  std::unordered_set<Edge, HashEdge> GetFanoutEdges(
+      const NodeDef& node, bool include_controlled_edges) const;
+  std::unordered_set<Edge, HashEdge> GetFaninEdges(
+      const NodeDef& node, bool include_controlling_edges) const;
+
  private:
   GraphDef* graph_;
   std::unordered_map<string, NodeDef*> nodes_;
-- 
GitLab


From 3624fe7d063f8fa6fe5bd864ced291f520c54cdd Mon Sep 17 00:00:00 2001
From: Ruoxin Sang <rxsang@google.com>
Date: Tue, 24 Apr 2018 14:42:07 -0700
Subject: [PATCH 382/450] Invalidate the StatCache as well as the
 FileBlockCache, as once the file is overwritten or removed, the stat will
 become outdated.

PiperOrigin-RevId: 194148397
---
 .../core/platform/cloud/expiring_lru_cache.h  | 18 +++++++
 .../platform/cloud/expiring_lru_cache_test.cc | 17 +++++++
 .../core/platform/cloud/gcs_file_system.cc    | 19 ++++---
 .../core/platform/cloud/gcs_file_system.h     |  3 ++
 .../platform/cloud/gcs_file_system_test.cc    | 50 +++++++++++++++++++
 5 files changed, 100 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache.h b/tensorflow/core/platform/cloud/expiring_lru_cache.h
index c738497ddd..e2d048f141 100644
--- a/tensorflow/core/platform/cloud/expiring_lru_cache.h
+++ b/tensorflow/core/platform/cloud/expiring_lru_cache.h
@@ -51,6 +51,14 @@ class ExpiringLRUCache {
     InsertLocked(key, value);
   }
 
+  // Delete the entry with key `key`. Return true if the entry was found for
+  // `key`, false if the entry was not found. In both cases, no entry with key
+  // `key` exists after the call.
+  bool Delete(const string& key) {
+    mutex_lock lock(mu_);
+    return DeleteLocked(key);
+  }
+
   /// Look up the entry with key `key` and copy it to `value` if found. Returns
   /// true if an entry was found for `key`, and its timestamp is not more than
   /// max_age_ seconds in the past.
@@ -141,6 +149,16 @@ class ExpiringLRUCache {
     }
   }
 
+  bool DeleteLocked(const string& key) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    auto it = cache_.find(key);
+    if (it == cache_.end()) {
+      return false;
+    }
+    lru_list_.erase(it->second.lru_iterator);
+    cache_.erase(it);
+    return true;
+  }
+
   /// The maximum age of entries in the cache, in seconds. A value of 0 means
   /// that no entry is ever placed in the cache.
   const uint64 max_age_;
diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
index 3bc6db3842..42879e80a9 100644
--- a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
+++ b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
@@ -174,5 +174,22 @@ TEST(ExpiringLRUCacheTest, Clear) {
   EXPECT_FALSE(cache.Lookup("d", &value));
 }
 
+TEST(ExpiringLRUCacheTest, Delete) {
+  // Insert an entry.
+  ExpiringLRUCache<int> cache(1, 4);
+  cache.Insert("a", 1);
+  int value = 0;
+  EXPECT_TRUE(cache.Lookup("a", &value));
+  EXPECT_EQ(value, 1);
+
+  // Delete the entry.
+  EXPECT_TRUE(cache.Delete("a"));
+  EXPECT_FALSE(cache.Lookup("a", &value));
+
+  // Try deleting the entry again.
+  EXPECT_FALSE(cache.Delete("a"));
+  EXPECT_FALSE(cache.Lookup("a", &value));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index f0003fa784..2d9c99c124 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -857,14 +857,20 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
   return Status::OK();
 }
 
+void GcsFileSystem::ClearFileCaches(const string& fname) {
+  file_block_cache_->RemoveFile(fname);
+  stat_cache_->Delete(fname);
+  // TODO(rxsang): Remove the patterns that match the file in
+  // MatchingPathsCache as well.
+}
+
 Status GcsFileSystem::NewWritableFile(const string& fname,
                                       std::unique_ptr<WritableFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
-  result->reset(new GcsWritableFile(
-      bucket, object, this, &timeouts_,
-      [this, fname]() { file_block_cache_->RemoveFile(fname); },
-      initial_retry_delay_usec_));
+  result->reset(new GcsWritableFile(bucket, object, this, &timeouts_,
+                                    [this, fname]() { ClearFileCaches(fname); },
+                                    initial_retry_delay_usec_));
   return Status::OK();
 }
 
@@ -904,8 +910,7 @@ Status GcsFileSystem::NewAppendableFile(const string& fname,
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
   result->reset(new GcsWritableFile(
       bucket, object, this, old_content_filename, &timeouts_,
-      [this, fname]() { file_block_cache_->RemoveFile(fname); },
-      initial_retry_delay_usec_));
+      [this, fname]() { ClearFileCaches(fname); }, initial_retry_delay_usec_));
   return Status::OK();
 }
 
@@ -1277,7 +1282,7 @@ Status GcsFileSystem::DeleteFile(const string& fname) {
   request->SetDeleteRequest();
 
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when deleting ", fname);
-  file_block_cache_->RemoveFile(fname);
+  ClearFileCaches(fname);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index 703c8d5778..99c94c1751 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -227,6 +227,9 @@ class GcsFileSystem : public FileSystem {
   Status LoadBufferFromGCS(const string& filename, size_t offset, size_t n,
                            char* buffer, size_t* bytes_transferred);
 
+  // Clear all the caches related to the file with name `fname`.
+  void ClearFileCaches(const string& fname);
+
   std::unique_ptr<AuthProvider> auth_provider_;
   std::unique_ptr<HttpRequest::Factory> http_request_factory_;
   std::unique_ptr<FileBlockCache> file_block_cache_;
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index ca4b7722b6..c639299954 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -1551,6 +1551,56 @@ TEST(GcsFileSystemTest, DeleteFile_NoObjectName) {
             fs.DeleteFile("gs://bucket/").code());
 }
 
+TEST(GcsFileSystemTest, DeleteFile_StatCacheRemoved) {
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "file.txt?fields=size%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"1010\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
+                           "/bucket/o/file.txt\n"
+                           "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
+                           "Delete: yes\n",
+                           ""),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "file.txt?fields=size%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           "", errors::NotFound("404"), 404),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
+           "fields=items%2Fname%2CnextPageToken&prefix=file.txt%2F"
+           "&maxResults=1\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           "{}")});
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      16 /* block size */, 16 /* max bytes */, 0 /* max staleness */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
+
+  // Stats the file first so the stat is cached.
+  FileStatistics stat_before_deletion;
+  TF_EXPECT_OK(fs.Stat("gs://bucket/file.txt", &stat_before_deletion));
+  EXPECT_EQ(1010, stat_before_deletion.length);
+
+  TF_EXPECT_OK(fs.DeleteFile("gs://bucket/file.txt"));
+
+  FileStatistics stat_after_deletion;
+  EXPECT_EQ(error::Code::NOT_FOUND,
+            fs.Stat("gs://bucket/file.txt", &stat_after_deletion).code());
+}
+
 TEST(GcsFileSystemTest, DeleteDir_Empty) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
-- 
GitLab


From 03005b129691bf6db8cf8c8c5a82be70ac79571c Mon Sep 17 00:00:00 2001
From: Billy Lamberta <blamb@google.com>
Date: Tue, 24 Apr 2018 14:52:38 -0700
Subject: [PATCH 383/450] docs: install_linux, move GPU section below install
 procedures.

---
 tensorflow/docs_src/install/install_linux.md | 198 +++++++++----------
 1 file changed, 98 insertions(+), 100 deletions(-)

diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index fa82ac9c40..c66d50c3cb 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -1,106 +1,25 @@
 # Installing TensorFlow on Ubuntu
 
-This guide explains how to install TensorFlow on Ubuntu. Although these
-instructions might also work on other Linux variants, we have only
-tested (and we only support) these instructions on machines meeting the
-following requirements:
+This guide explains how to install TensorFlow on Ubuntu Linux. While these
+instructions may work on other Linux variants, they are tested and supported with
+the following system requirements:
 
-  * 64-bit desktops or laptops
-  * Ubuntu 16.04 or higher
+* 64-bit desktops or laptops
+* Ubuntu 16.04 or higher
 
 
-## Determine which TensorFlow to install
+## Choose which TensorFlow to install
 
-You must choose one of the following types of TensorFlow to install:
+The following TensorFlow variants are available for installation:
 
-  * **TensorFlow with CPU support only**. If your system does not have a
-    NVIDIA® GPU, you must install this version. Note that this version of
-    TensorFlow is typically much easier to install (typically,
-    in 5 or 10 minutes), so even if you have an NVIDIA GPU, we recommend
-    installing this version first.
-  * **TensorFlow with GPU support**. TensorFlow programs typically run
-    significantly faster on a GPU than on a CPU. Therefore, if your
-    system has a NVIDIA® GPU meeting the prerequisites shown below and you
-    need to run performance-critical applications, you should ultimately
-    install this version.
-
-<a name="NVIDIARequirements"></a>
-### NVIDIA requirements to run TensorFlow with GPU support
-
-If you are installing TensorFlow with GPU support using one of the
-mechanisms described in this guide, then the following NVIDIA software
-must be installed on your system:
-
-  * [CUDA Toolkit 9.0](http://nvidia.com/cuda). For details, see
-    [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
-    Ensure that you append the relevant CUDA pathnames to the
-    `LD_LIBRARY_PATH` environment variable as described in the
-    NVIDIA documentation.
-  * [cuDNN SDK v7](http://developer.nvidia.com/cudnn). For details, see
-    [NVIDIA's documentation](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/).
-    Ensure that you create the `CUDA_HOME` environment variable as
-    described in the NVIDIA documentation.
-  * GPU card with CUDA Compute Capability 3.0 or higher for building
-    from source and 3.5 or higher for our binaries. See
-    [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for
-    a list of supported GPU cards.
-  * [GPU drivers](http://nvidia.com/driver) supporting your version of the CUDA
-    Toolkit.
-  * The libcupti-dev library, which is the NVIDIA CUDA Profile Tools Interface.
-    This library provides advanced profiling support. To install this library,
-    issue the following command for CUDA Toolkit >= 8.0:
-
-    <pre>
-    $ <b>sudo apt-get install cuda-command-line-tools</b>
-    </pre>
-
-    and add its path to your `LD_LIBRARY_PATH` environment variable:
-
-    <pre>
-    $ <b>export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64</b>
-    </pre>
-
-    For CUDA Toolkit <= 7.5 do:
-
-    <pre>
-    $ <b>sudo apt-get install libcupti-dev</b>
-    </pre>
-
-  * **[OPTIONAL]**  For optimized inferencing performance, you can also install
-    **NVIDIA TensorRT 3.0**. The minimal set of TensorRT runtime components needed
-    for use with the pre-built `tensorflow-gpu` package can be installed as follows:
-
-    <pre>
-    $ <b>wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</b>
-    $ <b>sudo dpkg -i nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</b>
-    $ <b>sudo apt-get update</b>
-    $ <b>sudo apt-get install -y --allow-downgrades libnvinfer-dev libcudnn7-dev=7.0.5.15-1+cuda9.0 libcudnn7=7.0.5.15-1+cuda9.0</b>
-    </pre>
-
-    **IMPORTANT:** For compatibility with the pre-built `tensorflow-gpu`
-    package, please use the Ubuntu **14.04** package of TensorRT as shown above,
-    even when installing onto an Ubuntu 16.04 system.<br/>
-    <br/>
-    To build the TensorFlow-TensorRT integration module from source rather than
-    using pre-built binaries, see the [module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#using-tensorrt-in-tensorflow).
-    For detailed TensorRT installation instructions, see [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html).<br/>
-    <br/>
-    To avoid cuDNN version conflicts during later system upgrades, you can hold
-    the cuDNN version at 7.0.5:
-
-    <pre>
-    $ <b> sudo apt-mark hold libcudnn7 libcudnn7-dev</b>
-    </pre>
-
-    To later allow upgrades, you can remove the hold:
-
-    <pre>
-    $ <b> sudo apt-mark unhold libcudnn7 libcudnn7-dev</b>
-    </pre>
-
-If you have an earlier version of the preceding packages, please upgrade to
-the specified versions. If upgrading is not possible, then you may still run
-TensorFlow with GPU support, if you @{$install_sources$install TensorFlow from Sources}.
+* __TensorFlow with CPU support only__. If your system does not have an
+  NVIDIA®&nbsp;GPU, you must install this version. This version of TensorFlow is
+  usually easier to install, so even if you have an NVIDIA GPU, we recommend
+  installing this version first.
+* __TensorFlow with GPU support__. TensorFlow programs usually run much faster on
+  a GPU than on a CPU. If you run performance-critical applications and your
+  system has an NVIDIA®&nbsp;GPU that meets the prerequisites, you should install
+  this version. See [TensorFlow GPU support](#NVIDIARequirements) for details.
 
 
 ## How to install TensorFlow
@@ -131,8 +50,8 @@ On Ubuntu, Python is automatically installed and `pip` is *usually* installed.
 Confirm the `python` and `pip` versions:
 
 <pre class="prettyprint lang-bsh">
-  <code class="devsite-terminal">python -V</code>
-  <code class="devsite-terminal">pip -V  # or: pip3 -V</code>
+  <code class="devsite-terminal">python -V  # or: python3 -V</code>
+  <code class="devsite-terminal">pip -V     # or: pip3 -V</code>
 </pre>
 
 To install these packages on Ubuntu:
@@ -264,8 +183,8 @@ On Ubuntu, Python is automatically installed and `pip` is *usually* installed.
 Confirm the `python` and `pip` versions:
 
 <pre class="prettyprint lang-bsh">
-  <code class="devsite-terminal">python -V</code>
-  <code class="devsite-terminal">pip -V  # or: pip3 -V</code>
+  <code class="devsite-terminal">python -V  # or: python3 -V</code>
+  <code class="devsite-terminal">pip -V     # or: pip3 -V</code>
 </pre>
 
 To install these packages on Ubuntu:
@@ -578,6 +497,85 @@ If you are new to machine learning, we recommend the following:
 *  @{$get_started/eager}
 
 
+<a name="NVIDIARequirements"></a>
+## TensorFlow GPU support
+
+To install TensorFlow with GPU support, configure the following NVIDIA® software
+on your system:
+
+* [CUDA Toolkit 9.0](http://nvidia.com/cuda). For details, see
+  [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
+  Append the relevant CUDA pathnames to the `LD_LIBRARY_PATH` environmental
+  variable as described in the NVIDIA documentation.
+* [cuDNN SDK v7](http://developer.nvidia.com/cudnn). For details, see
+  [NVIDIA's documentation](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/).
+  Create the `CUDA_HOME` environment variable as described in the NVIDIA
+  documentation.
+* A GPU card with CUDA Compute Capability 3.0 or higher for building TensorFlow
+  from source. To use the TensorFlow binaries, version 3.5 or higher is required.
+  See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a
+  list of supported GPU cards.
+* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA
+  Toolkit.
+* The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This
+  library provides advanced profiling support. To install this library,
+  use the following command for CUDA Toolkit >= 8.0:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-get install cuda-command-line-tools</code>
+</pre>
+
+Add this path to the `LD_LIBRARY_PATH` environmental variable:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64</code>
+</pre>
+
+For CUDA Toolkit <= 7.5 use:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-get install libcupti-dev</code>
+</pre>
+
+* *OPTIONAL*:  For optimized performance during inference, install
+  *NVIDIA&nbsp;TensorRT&nbsp;3.0*. To install the minimal amount of TensorRT
+  runtime components required to use with the pre-built `tensorflow-gpu` package:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</code>
+  <code class="devsite-terminal">sudo dpkg -i nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</code>
+  <code class="devsite-terminal">sudo apt-get update</code>
+  <code class="devsite-terminal">sudo apt-get install -y --allow-downgrades libnvinfer-dev libcudnn7-dev=7.0.5.15-1+cuda9.0 libcudnn7=7.0.5.15-1+cuda9.0</code>
+</pre>
+
+Note: For compatibility with the pre-built `tensorflow-gpu` package, use the
+Ubuntu *14.04* package of TensorRT (shown above). Use this even when installing
+on an Ubuntu 16.04 system.
+
+To build the TensorFlow-TensorRT integration module from source instead of using
+the pre-built binaries, see the
+[module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#using-tensorrt-in-tensorflow).
+For detailed TensorRT installation instructions, see
+[NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html).
+
+To avoid cuDNN version conflicts during later system upgrades, hold the cuDNN
+version at 7.0.5:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-mark hold libcudnn7 libcudnn7-dev</code>
+</pre>
+
+To allow upgrades, remove this hold:
+
+<pre class="prettyprint lang-bsh">
+  <code class="devsite-terminal">sudo apt-mark unhold libcudnn7 libcudnn7-dev</code>
+</pre>
+
+If you have an earlier version of the preceding packages, upgrade to the
+specified versions. If upgrading is not possible, you can still run TensorFlow
+with GPU support by @{$install_sources}.
+
+
 ## Common installation problems
 
 We are relying on Stack Overflow to document TensorFlow installation problems
-- 
GitLab


From 184c8306a4a3d41f42f077b4898933500d61ce86 Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Tue, 24 Apr 2018 14:52:59 -0700
Subject: [PATCH 384/450] Add deprecation notice to replicate_model_fn.

PiperOrigin-RevId: 194150426
---
 tensorflow/contrib/estimator/BUILD                         | 1 +
 .../estimator/python/estimator/replicate_model_fn.py       | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 62ddb3d290..b473de86ee 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -367,6 +367,7 @@ py_library(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:model_fn",
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index a8774d6dab..f8564446e5 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -47,8 +47,12 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import device_setter as device_setter_lib
 from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.util import deprecation
 
 
+@deprecation.deprecated(
+    '2018-05-31',
+    'Please use `tf.contrib.distribute.MirroredStrategy` instead.')
 def replicate_model_fn(model_fn,
                        loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
                        devices=None):
@@ -255,6 +259,9 @@ class TowerOptimizer(optimizer_lib.Optimizer):
 
   COLLECTION_FOR_GRAPH_STATES = 'replicate_model_fn_graph_states'
 
+  @deprecation.deprecated(
+      '2018-05-31',
+      'Please use `tf.contrib.distribute.MirroredStrategy` instead.')
   def __init__(self, optimizer_or_optimizer_fn):
     """Wrap an existing optimizer for gathering gradients across towers.
 
-- 
GitLab


From c13af7d5a2bde4cedd28336e688f15d9bc0d886c Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Tue, 24 Apr 2018 14:55:47 -0700
Subject: [PATCH 385/450] Fix a bug where string::substr is used with wrong
 position.

---
 .../contrib/tensorrt/convert/convert_graph.cc       | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index b412b296e0..0774027711 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -111,20 +111,22 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
   }
 }
 
-std::pair<string, int> ParseTensorName(string name, int default_idx = 0) {
+std::pair<string, int> ParseTensorName(const string& name,
+                                       int default_idx = 0) {
+  string name_no_idx = name;
   int idx = default_idx;
-  size_t sep = name.find_last_of(':');
+  const size_t sep = name_no_idx.find_last_of(':');
   if (sep != string::npos) {
-    name = name.substr(0, sep);
+    name_no_idx = name_no_idx.substr(0, sep);
     idx = std::stoi(name.substr(sep + 1));
   }
-  return std::make_pair(name, idx);
+  return std::make_pair(name_no_idx, idx);
 }
 
 std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
     const std::vector<string>& tensor_names) {
   std::unordered_map<string, std::vector<int>> result;
-  for (string const& tensor_name : tensor_names) {
+  for (const string& tensor_name : tensor_names) {
     string node_name;
     int index;
     std::tie(node_name, index) = ParseTensorName(tensor_name);
@@ -132,6 +134,7 @@ std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
   }
   return result;
 }
+
 // TODO(sami): convert references to pointers
 struct ConvertGraphParams {
   ConvertGraphParams(
-- 
GitLab


From e7db82f821a1c522eed9e0c633df8b3db26ef38d Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 24 Apr 2018 15:45:50 -0700
Subject: [PATCH 386/450] Make TF functions work with _USE_C_SHAPES=True.

It turns out regular functions need to manually copy handle data in
addition to eager GraphModeFunctions, so I moved the C extensions to
python_api.h from eager/c_api.h.

This also cleans up function_test.py to assume the C API is enabled.

PiperOrigin-RevId: 194158700
---
 tensorflow/c/eager/BUILD                      |  2 -
 tensorflow/c/eager/c_api.cc                   | 57 -------------------
 tensorflow/c/eager/c_api.h                    | 14 -----
 tensorflow/c/python_api.cc                    | 28 ++++++++-
 tensorflow/c/python_api.h                     | 12 +++-
 tensorflow/python/client/tf_session.i         |  2 +-
 tensorflow/python/eager/function.py           |  2 +-
 tensorflow/python/framework/function.py       | 10 +++-
 tensorflow/python/framework/function_test.py  | 37 +++---------
 tensorflow/python/framework/ops.py            |  4 +-
 .../python/ops/resource_variable_ops.py       |  9 +--
 tensorflow/python/pywrap_tfe.i                |  2 -
 12 files changed, 59 insertions(+), 120 deletions(-)

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index fae922ea3b..1432119162 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -40,8 +40,6 @@ tf_cuda_library(
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
-            # TODO(b/74620627): move this here
-            "//tensorflow/python:cpp_shape_inference_proto_cc",
         ],
     }) + select({
         "//tensorflow:with_xla_support": [
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 975bde7c7f..3bf071f3ab 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -48,7 +48,6 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/public/version.h"
-#include "tensorflow/python/framework/cpp_shape_inference.pb.h"
 
 using tensorflow::int64;
 using tensorflow::string;
@@ -503,62 +502,6 @@ void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
   ctx->context.RunMetadataProto()->Clear();
 }
 
-void TFE_GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
-                                       TF_Buffer* output_proto,
-                                       TF_Status* status) {
-  tensorflow::Node* node = &output.oper->node;
-  tensorflow::CppShapeInferenceResult::HandleData handle_data;
-  handle_data.set_is_set(true);
-  {
-    tensorflow::mutex_lock l(graph->mu);
-    tensorflow::shape_inference::InferenceContext* ic =
-        graph->refiner.GetContext(node);
-    CHECK(ic != nullptr);
-    CHECK_LT(output.index, ic->num_outputs());
-    const auto* shapes_and_types =
-        ic->output_handle_shapes_and_types(output.index);
-    if (shapes_and_types == nullptr) {
-      output_proto->data = nullptr;
-      output_proto->length = 0;
-      output_proto->data_deallocator = nullptr;
-      return;
-    }
-
-    for (const auto& p : *shapes_and_types) {
-      auto* out_shape_and_type = handle_data.add_shape_and_type();
-      ic->ShapeHandleToProto(p.shape, out_shape_and_type->mutable_shape());
-      out_shape_and_type->set_dtype(p.dtype);
-    }
-  }
-  status->status = MessageToBuffer(handle_data, output_proto);
-}
-
-void TFE_SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
-                                       const void* proto, size_t proto_len,
-                                       TF_Status* status) {
-  tensorflow::CppShapeInferenceResult::HandleData handle_data;
-  if (!handle_data.ParseFromArray(proto, proto_len)) {
-    status->status = tensorflow::errors::InvalidArgument(
-        "Couldn't deserialize HandleData proto");
-    return;
-  }
-  DCHECK(handle_data.is_set());
-
-  tensorflow::mutex_lock l(graph->mu);
-  tensorflow::shape_inference::InferenceContext* ic =
-      graph->refiner.GetContext(&output.oper->node);
-
-  std::vector<tensorflow::shape_inference::ShapeAndType> shapes_and_types;
-  for (const auto& shape_and_type_proto : handle_data.shape_and_type()) {
-    tensorflow::shape_inference::ShapeHandle shape;
-    status->status =
-        ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape);
-    if (status->status.ok()) return;
-    shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype());
-  }
-  ic->set_output_handle_shapes_and_types(output.index, shapes_and_types);
-}
-
 namespace {
 TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func,
                 TF_Status* status) {
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index ba77f3cd07..c06ce84a8c 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -329,20 +329,6 @@ TF_CAPI_EXPORT extern void TFE_ContextExportRunMetadata(TFE_Context* ctx,
                                                         TF_Buffer* buf,
                                                         TF_Status* status);
 
-// Returns the serialized CppShapeInferenceResult::HandleData proto for
-// `output` if its a resource tensor, or otherwise returns an empty buffer.
-TF_CAPI_EXPORT extern void TFE_GetResourceHandleShapeAndType(
-    TF_Graph* graph, TF_Output output, TF_Buffer* output_proto,
-    TF_Status* status);
-
-// Sets `output` based on `proto`, which should be a serialized
-// CppShapeInferenceResult::HandleData proto.
-TF_CAPI_EXPORT extern void TFE_SetResourceHandleShapeAndType(TF_Graph* graph,
-                                                             TF_Output output,
-                                                             const void* proto,
-                                                             size_t proto_len,
-                                                             TF_Status* status);
-
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index 93155998b8..e18fdf6c57 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -110,7 +110,7 @@ void ExtendSession(TF_Session* session, TF_Status* status) {
   session->extend_before_run = false;
 }
 
-std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) {
+std::string GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) {
   Node* node = &output.oper->node;
   CppShapeInferenceResult::HandleData handle_data;
   handle_data.set_is_set(true);
@@ -135,4 +135,30 @@ std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) {
   return result;
 }
 
+void SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
+                                   const void* proto, size_t proto_len,
+                                   TF_Status* status) {
+  tensorflow::CppShapeInferenceResult::HandleData handle_data;
+  if (!handle_data.ParseFromArray(proto, proto_len)) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Couldn't deserialize HandleData proto");
+    return;
+  }
+  DCHECK(handle_data.is_set());
+  // Hold the graph lock while reading and updating the shape refiner.
+  tensorflow::mutex_lock l(graph->mu);
+  tensorflow::shape_inference::InferenceContext* ic =
+      graph->refiner.GetContext(&output.oper->node);
+  // Convert every proto entry; stop at the first one that fails to convert.
+  std::vector<tensorflow::shape_inference::ShapeAndType> shapes_and_types;
+  for (const auto& shape_and_type_proto : handle_data.shape_and_type()) {
+    tensorflow::shape_inference::ShapeHandle shape;
+    status->status =
+        ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape);
+    if (!status->status.ok()) return;  // Error is reported via `status`.
+    shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype());
+  }
+  ic->set_output_handle_shapes_and_types(output.index, shapes_and_types);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index 2d4c8cd9ed..4bcb5bde62 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -55,9 +55,15 @@ void ExtendSession(TF_Session* session, TF_Status* status);
 
 // Returns the serialized CppShapeInferenceResult::HandleData proto for
 // `output` if its a resource tensor, or otherwise returns the empty string.
-// TODO(b/74620627): remove when _USE_C_SHAPES is removed
-std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output);
-
+std::string GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output);
+
+// Sets `output` based on `proto`, which should be a serialized
+// CppShapeInferenceResult::HandleData proto.
+// NOTE(skyewm): `proto` is passed a void*/size_t pair instead of a std::string
+// because I couldn't get SWIG to work otherwise.
+void SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
+                                   const void* proto, size_t proto_len,
+                                   TF_Status* status);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index b82182d5d3..1db1432d65 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -458,7 +458,7 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
 }
 
 // Override default py3 behavior of attempting to encode into Unicode.
-%typemap(out) std::string tensorflow::ResourceHandleShapeAndType {
+%typemap(out) std::string tensorflow::GetResourceHandleShapeAndType {
   $result = PyBytes_FromStringAndSize($1.data(), $1.size());
 }
 
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index b924448abe..bdbbe864df 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -80,7 +80,7 @@ def capture_value(tensor_map, value, dtype, name):
       if handle_data is not None and handle_data.is_set:
         # pylint: disable=protected-access
         if ops._USE_C_SHAPES:
-          pywrap_tensorflow.TFE_SetResourceHandleShapeAndType(
+          pywrap_tensorflow.SetResourceHandleShapeAndType(
               captured_value.graph._c_graph, captured_value._as_tf_output(),
               handle_data.SerializeToString())
         else:
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 9570f009a5..f343edc483 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -703,7 +703,15 @@ class _FuncGraph(ops.Graph):
     with ops.control_dependencies(None):
       ph = array_ops.placeholder(tensor.dtype, shape=tensor.get_shape())
     # pylint: disable=protected-access
-    ph._handle_data = tensor._handle_data
+    if ops._USE_C_SHAPES:
+      handle_data = c_api.GetResourceHandleShapeAndType(tensor.graph._c_graph,
+                                                        tensor._as_tf_output())
+      if handle_data:
+        c_api.SetResourceHandleShapeAndType(ph.graph._c_graph,
+                                            ph._as_tf_output(),
+                                            compat.as_bytes(handle_data))
+    else:
+      ph._handle_data = tensor._handle_data
     # pylint: enable=protected-access
     self._captured[tensor] = ph
     self.extra_args.append(ph)
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index d6bc14fbc7..cfdacee54f 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -85,7 +85,7 @@ def _OptimizerOptions():
         yield cfg
 
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class FunctionTest(test.TestCase):
   """Test methods for verifying Function support.
 
@@ -431,7 +431,6 @@ class FunctionTest(test.TestCase):
                                    "assertion failed.*-3"):
         self.assertAllEqual(Foo(constant_op.constant(-3.0)).eval(), 6.0)
 
-  @test_util.disable_c_api   # Op._add_control_inputs doesn't work with C API
   def testAssertWrapper(self):
 
     @function.Defun(dtypes.float32)
@@ -446,7 +445,6 @@ class FunctionTest(test.TestCase):
                                    "assertion"):
         _ = MyFn(100.0).eval()
 
-  @test_util.disable_c_api   # Op._add_control_inputs doesn't work with C API
   def testWhileLoopCallsFunc(self):
     with self.test_session(use_gpu=True) as sess:
 
@@ -466,7 +464,6 @@ class FunctionTest(test.TestCase):
       ans = sess.run(loop)
       self.assertAllClose(ans, 131072.)
 
-  @test_util.disable_c_api   # Op._add_control_inputs doesn't work with C API
   def testControlFlowStrictness(self):
     """Inlined functions must not execute in a untaken control flow branch."""
 
@@ -1054,7 +1051,7 @@ class FunctionTest(test.TestCase):
         self.assertEqual((42.0, 44.0), sess.run((f_0, f_1)))
 
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class FunctionsFromProtos(test.TestCase):
 
   def expectFunctionsEqual(self, func, grad_func=None, new_func=None):
@@ -1256,7 +1253,7 @@ class FunctionsFromProtos(test.TestCase):
         FunctionWithAttr.definition.attr["experimental_tag"].s, b"tag_value")
 
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class FunctionOverloadTest(test.TestCase):
 
   def testBasic(self):
@@ -1309,7 +1306,7 @@ class FunctionOverloadTest(test.TestCase):
                      "Successor of x.")
 
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class FunctionCaptureByValueTest(test.TestCase):
 
   def testCaptureByValue(self):
@@ -1339,7 +1336,7 @@ class FunctionCaptureByValueTest(test.TestCase):
       self.assertAllEqual(y.eval(), [[12.0]])
 
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class UnrollLSTMTest(test.TestCase):
   BATCH_SIZE = 16
   LSTM_DIMS = 32
@@ -1475,7 +1472,7 @@ class UnrollLSTMTest(test.TestCase):
       self.assertAllClose(d0, d3, rtol=1e-4, atol=1e-4)
 
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class FunctionInlineControlTest(test.TestCase):
 
   def testFoo(self):
@@ -1543,10 +1540,6 @@ def Linear2(w1, b1, w2, b2, x):
   return Linear(w2, b2, Linear(w1, b1, x))
 
 
-# Set C API before defining module level functions
-ops._USE_C_API = True
-
-
 @function.Defun(*[dtypes.float32] * 3)
 def LinearWithCApi(w, b, x):
   return nn_ops.relu(math_ops.matmul(x, w) + b)
@@ -1557,25 +1550,9 @@ def Linear2WithCApi(w1, b1, w2, b2, x):
   return LinearWithCApi(w2, b2, LinearWithCApi(w1, b1, x))
 
 
-# Unset C API after defining module level functions
-ops._USE_C_API = False
-
-
 class ModuleFunctionTest(test.TestCase):
 
   def testBasic(self):
-    with ops.Graph().as_default():
-      a, b, c, d, e = [
-          constant_op.constant([[_]], dtype=dtypes.float32) for _ in range(5)
-      ]
-      y = Linear(a, b, c)
-      z = Linear2(a, b, c, d, e)
-      with session.Session() as sess:
-        self.assertAllEqual([[1]], sess.run(y))
-        self.assertAllEqual([[5]], sess.run(z))
-
-  @test_util.enable_c_api
-  def testBasicWithCApi(self):
     with ops.Graph().as_default():
       a, b, c, d, e = [
           constant_op.constant([[_]], dtype=dtypes.float32) for _ in range(5)
@@ -1587,7 +1564,7 @@ class ModuleFunctionTest(test.TestCase):
         self.assertAllEqual([[5]], sess.run(z))
 
 
-@test_util.with_c_api
+@test_util.with_c_shapes
 class VariableHoistingTest(test.TestCase):
 
   def _testSimpleModel(self, use_forward_func, use_resource=False):
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 8cd6820f6a..16a8c575c6 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -2557,8 +2557,8 @@ def _set_shape_and_handle_data_for_outputs_c_api(op):
     output._shape_val = output._c_api_shape()
     # Set the resource handle data for compatibility with the Python shape
     # inference code.
-    serialized = c_api.ResourceHandleShapeAndType(
-        op._graph._c_graph, output._as_tf_output())
+    serialized = c_api.GetResourceHandleShapeAndType(op._graph._c_graph,
+                                                     output._as_tf_output())
     if serialized:
       output._handle_data = (
           cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 4d26b2f46e..1e953f658f 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -24,7 +24,6 @@ from tensorflow.core.framework import variable_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
-from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -47,13 +46,11 @@ def get_resource_handle_data(graph_op):
   assert ops._USE_C_SHAPES  # pylint: disable=protected-access
   assert type(graph_op) == ops.Tensor  # pylint: disable=unidiomatic-typecheck
 
-  with c_api_util.tf_buffer() as buf:
-    pywrap_tensorflow.TFE_GetResourceHandleShapeAndType(
-        graph_op.graph._c_graph, graph_op._as_tf_output(), buf)  # pylint: disable=protected-access
-    data = pywrap_tensorflow.TF_GetBuffer(buf)
+  handle_data = pywrap_tensorflow.GetResourceHandleShapeAndType(
+      graph_op.graph._c_graph, graph_op._as_tf_output())  # pylint: disable=protected-access
 
   return cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData.FromString(
-      compat.as_bytes(data))
+      compat.as_bytes(handle_data))
 
 
 def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 0982a67dee..5ee55301df 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -59,8 +59,6 @@ limitations under the License.
 %rename("%s") TFE_ContextOptionsSetAsync;
 %rename("%s") TFE_DeleteContextOptions;
 %rename("%s") TFE_Py_TensorShapeSlice;
-%rename("%s") TFE_GetResourceHandleShapeAndType;
-%rename("%s") TFE_SetResourceHandleShapeAndType;
 
 %{
 #include "tensorflow/python/eager/pywrap_tfe.h"
-- 
GitLab


From d85610e5d25b4a9150446841d659a17ae1673ddd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 15:49:53 -0700
Subject: [PATCH 387/450] Fix flaky timeouts in metric_ops_test by sharding
 more.

PiperOrigin-RevId: 194159328
---
 tensorflow/contrib/metrics/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD
index 5ca42f41c1..e050f3c8d4 100644
--- a/tensorflow/contrib/metrics/BUILD
+++ b/tensorflow/contrib/metrics/BUILD
@@ -77,7 +77,7 @@ py_test(
 py_test(
     name = "metric_ops_test",
     srcs = ["python/ops/metric_ops_test.py"],
-    shard_count = 3,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     tags = ["noasan"],  # times out b/63678675
     deps = [
-- 
GitLab


From 29b23ba7afe79035eacf04886aa2636a093f12fb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 15:50:49 -0700
Subject: [PATCH 388/450] Add support for tensors to numpy array related
 assertion methods in test_util.TensorflowTestCase.

PiperOrigin-RevId: 194159512
---
 tensorflow/python/framework/test_util.py      | 209 +++++++++++++++++-
 tensorflow/python/framework/test_util_test.py | 193 ++++++++++++++++
 2 files changed, 395 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 5a8bc43727..dc56d88066 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import contextlib
 import gc
+import itertools
 import math
 import random
 import re
@@ -1212,8 +1213,14 @@ class TensorFlowTestCase(googletest.TestCase):
     self.assertTrue(self._NDArrayNear(ndarray1, ndarray2, err), msg=msg)
 
   def _GetNdArray(self, a):
+    # If a is a tensor then convert it to ndarray
+    if isinstance(a, ops.Tensor):
+      if isinstance(a, ops._EagerTensorBase):
+        return a.numpy()
+      else:
+        a = self.evaluate(a)
     if not isinstance(a, np.ndarray):
-      a = np.array(a)
+      return np.array(a)
     return a
 
   def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
@@ -1286,8 +1293,8 @@ class TensorFlowTestCase(googletest.TestCase):
       # Try to directly compare a, b as ndarrays; if not work, then traverse
       # through the sequence, which is more expensive.
       try:
-        a_as_ndarray = np.array(a)
-        b_as_ndarray = np.array(b)
+        a_as_ndarray = self._GetNdArray(a)
+        b_as_ndarray = self._GetNdArray(b)
         self._assertArrayLikeAllClose(
             a_as_ndarray,
             b_as_ndarray,
@@ -1322,16 +1329,18 @@ class TensorFlowTestCase(googletest.TestCase):
         raise
 
   def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
-    """Asserts that two structures of numpy arrays, have near values.
+    """Asserts that two structures of numpy arrays or Tensors, have near values.
 
     `a` and `b` can be arbitrarily nested structures. A layer of a nested
     structure can be a `dict`, `namedtuple`, `tuple` or `list`.
 
     Args:
       a: The expected numpy `ndarray`, or anything that can be converted into a
-          numpy `ndarray`, or any arbitrarily nested of structure of these.
+         numpy `ndarray` (including Tensor), or any arbitrarily nested of
+         structure of these.
       b: The actual numpy `ndarray`, or anything that can be converted into a
-          numpy `ndarray`, or any arbitrarily nested of structure of these.
+         numpy `ndarray` (including Tensor), or any arbitrarily nested of
+         structure of these.
       rtol: relative tolerance.
       atol: absolute tolerance.
       msg: Optional message to report on failure.
@@ -1391,8 +1400,26 @@ class TensorFlowTestCase(googletest.TestCase):
 
     self.assertAllClose(a, b, rtol=rtol, atol=atol, msg=msg)
 
+  def assertNotAllClose(self, a, b, **kwargs):
+    """Assert that two numpy arrays, or Tensors, do not have near values.
+
+    Args:
+      a: the first value to compare.
+      b: the second value to compare.
+      **kwargs: additional keyword arguments to be passed to the underlying
+        `assertAllClose` call.
+
+    Raises:
+      AssertionError: If `a` and `b` are unexpectedly close at all elements.
+    """
+    try:
+      self.assertAllClose(a, b, **kwargs)
+    except AssertionError:
+      return
+    raise AssertionError("The two values are close at all elements")
+
   def assertAllEqual(self, a, b, msg=None):
-    """Asserts that two numpy arrays have the same values.
+    """Asserts that two numpy arrays or Tensors have the same values.
 
     Args:
       a: the expected numpy ndarray or anything can be converted to one.
@@ -1424,6 +1451,174 @@ class TensorFlowTestCase(googletest.TestCase):
       print("not equal rhs = ", y)
       np.testing.assert_array_equal(a, b, err_msg=msg)
 
+  def assertAllGreater(self, a, comparison_target):
+    """Assert element values are all greater than a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertGreater(np.min(a), comparison_target)
+
+  def assertAllLess(self, a, comparison_target):
+    """Assert element values are all less than a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertLess(np.max(a), comparison_target)
+
+  def assertAllGreaterEqual(self, a, comparison_target):
+    """Assert element values are all greater than or equal to a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertGreaterEqual(np.min(a), comparison_target)
+
+  def assertAllLessEqual(self, a, comparison_target):
+    """Assert element values are all less than or equal to a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertLessEqual(np.max(a), comparison_target)
+
+  def _format_subscripts(self, subscripts, value, limit=10, indent=2):
+    """Generate a summary of ndarray subscripts as a list of str.
+
+    If limit == N, this method will print up to the first N subscripts on
+    separate
+    lines. A line of ellipses (...) will be appended at the end if the number of
+    subscripts exceeds N.
+
+    Args:
+      subscripts: The tensor (np.ndarray) subscripts, of the same format as
+        np.where()'s return value, i.e., a tuple of arrays with each array
+        corresponding to a dimension. E.g., (array([1, 1]), array([0, 1])).
+      value: (np.ndarray) value of the tensor.
+      limit: (int) The maximum number of indices to print.
+      indent: (int) Number of characters to indent at the beginning of each
+        line.
+
+    Returns:
+      (list of str) the multi-line representation of the subscripts and values,
+        potentially with omission at the end.
+    """
+    lines = []
+    subscripts = np.transpose(subscripts)
+    prefix = " " * indent
+    for subscript in itertools.islice(subscripts, limit):
+      lines.append(prefix + str(subscript) + " : " +
+                   str(value[tuple(subscript)]))
+    if len(subscripts) > limit:
+      lines.append(prefix + "...")
+    return lines
+
+  def assertAllInRange(self,
+                       target,
+                       lower_bound,
+                       upper_bound,
+                       open_lower_bound=False,
+                       open_upper_bound=False):
+    """Assert that elements in a Tensor are all in a given range.
+
+    Args:
+      target: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      lower_bound: lower bound of the range
+      upper_bound: upper bound of the range
+      open_lower_bound: (`bool`) whether the lower bound is open (i.e., > rather
+        than the default >=)
+      open_upper_bound: (`bool`) whether the upper bound is open (i.e., < rather
+        than the default <=)
+
+    Raises:
+      AssertionError:
+        if the value tensor does not have an ordered numeric type (float* or
+          int*), or
+        if there are nan values, or
+        if any of the elements do not fall in the specified range.
+    """
+    target = self._GetNdArray(target)
+    if not (np.issubdtype(target.dtype, np.floating) or
+            np.issubdtype(target.dtype, np.integer)):
+      raise AssertionError(
+          "The value of %s does not have an ordered numeric type, instead it "
+          "has type: %s" % (target, target.dtype))
+
+    nan_subscripts = np.where(np.isnan(target))
+    if np.size(nan_subscripts):
+      raise AssertionError(
+          "%d of the %d element(s) are NaN. "
+          "Subscripts(s) and value(s) of the NaN element(s):\n" %
+          (len(nan_subscripts[0]), np.size(target)) +
+          "\n".join(self._format_subscripts(nan_subscripts, target)))
+
+    range_str = (("(" if open_lower_bound else "[") + str(lower_bound) + ", " +
+                 str(upper_bound) + (")" if open_upper_bound else "]"))
+
+    violations = (
+        np.less_equal(target, lower_bound)
+        if open_lower_bound else np.less(target, lower_bound))
+    violations = np.logical_or(
+        violations,
+        np.greater_equal(target, upper_bound)
+        if open_upper_bound else np.greater(target, upper_bound))
+    violation_subscripts = np.where(violations)
+    if np.size(violation_subscripts):
+      raise AssertionError(
+          "%d of the %d element(s) are outside the range %s. " %
+          (len(violation_subscripts[0]), np.size(target), range_str) +
+          "Subscript(s) and value(s) of the offending elements:\n" +
+          "\n".join(self._format_subscripts(violation_subscripts, target)))
+
+  def assertAllInSet(self, target, expected_set):
+    """Assert that elements of a Tensor are all in a given closed set.
+
+    Args:
+      target: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      expected_set: (`list`, `tuple` or `set`) The closed set that the elements
+        of the value of `target` are expected to fall into.
+
+    Raises:
+      AssertionError:
+        if any of the elements do not fall into `expected_set`.
+    """
+    target = self._GetNdArray(target)
+
+    # Elements in target that are not in expected_set.
+    diff = np.setdiff1d(target.flatten(), list(expected_set))
+    if np.size(diff):
+      raise AssertionError("%d unique element(s) are not in the set %s: %s" %
+                           (np.size(diff), expected_set, diff))
+
+  def assertDTypeEqual(self, target, expected_dtype):
+    """Assert ndarray data type is equal to expected.
+
+    Args:
+      target: The numpy `ndarray`, or anything that can be converted into a
+         numpy `ndarray` (including Tensor).
+      expected_dtype: Expected data type.
+    """
+    target = self._GetNdArray(target)
+    # Always bind `arrays`, so a list result cannot raise UnboundLocalError.
+    arrays = target if isinstance(target, list) else [target]
+    for arr in arrays:
+      self.assertEqual(arr.dtype, expected_dtype)
+
   # pylint: disable=g-doc-return-or-yield
   @contextlib.contextmanager
   def assertRaisesWithPredicateMatch(self, exception_type,
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index 02ffa93bae..8d492256aa 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -31,13 +31,16 @@ from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_ops  # pylint: disable=unused-import
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
@@ -209,6 +212,21 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self._WeMustGoDeeper("name")
     self._WeMustGoDeeper("orig")
 
+  def testAllCloseTensors(self):
+    a_raw_data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
+    a = constant_op.constant(a_raw_data)
+    b = math_ops.add(1, constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]]))
+    self.assertAllClose(a, b)
+    self.assertAllClose(a, a_raw_data)
+
+    a_dict = {"key": a}
+    b_dict = {"key": b}
+    self.assertAllClose(a_dict, b_dict)
+
+    x_list = [a, b]
+    y_list = [a_raw_data, b]
+    self.assertAllClose(x_list, y_list)
+
   def testAllCloseScalars(self):
     self.assertAllClose(7, 7 + 1e-8)
     with self.assertRaisesRegexp(AssertionError, r"Not equal to tolerance"):
@@ -317,6 +335,12 @@ class TestUtilTest(test_util.TensorFlowTestCase):
         rtol=1e-8, atol=1e-8
     )
 
+    self.assertAllCloseAccordingToType(
+        constant_op.constant([1e-8], dtype=dtypes.float64),
+        constant_op.constant([2e-8], dtype=dtypes.float64),
+        rtol=1e-8,
+        atol=1e-8)
+
     with (self.assertRaises(AssertionError)):
       self.assertAllCloseAccordingToType(
           np.asarray([1e-7], dtype=np.float64),
@@ -332,6 +356,14 @@ class TestUtilTest(test_util.TensorFlowTestCase):
         float_rtol=1e-7, float_atol=1e-7
     )
 
+    self.assertAllCloseAccordingToType(
+        constant_op.constant([1e-7], dtype=dtypes.float32),
+        constant_op.constant([2e-7], dtype=dtypes.float32),
+        rtol=1e-8,
+        atol=1e-8,
+        float_rtol=1e-7,
+        float_atol=1e-7)
+
     with (self.assertRaises(AssertionError)):
       self.assertAllCloseAccordingToType(
           np.asarray([1e-6], dtype=np.float32),
@@ -349,6 +381,16 @@ class TestUtilTest(test_util.TensorFlowTestCase):
         half_rtol=1e-4, half_atol=1e-4
     )
 
+    self.assertAllCloseAccordingToType(
+        constant_op.constant([1e-4], dtype=dtypes.float16),
+        constant_op.constant([2e-4], dtype=dtypes.float16),
+        rtol=1e-8,
+        atol=1e-8,
+        float_rtol=1e-7,
+        float_atol=1e-7,
+        half_rtol=1e-4,
+        half_atol=1e-4)
+
     with (self.assertRaises(AssertionError)):
       self.assertAllCloseAccordingToType(
           np.asarray([1e-3], dtype=np.float16),
@@ -358,6 +400,157 @@ class TestUtilTest(test_util.TensorFlowTestCase):
           half_rtol=1e-4, half_atol=1e-4
       )
 
+  def testAssertAllEqual(self):
+    i = variables.Variable([100] * 3, dtype=dtypes.int32, name="i")
+    j = constant_op.constant([20] * 3, dtype=dtypes.int32, name="j")
+    k = math_ops.add(i, j, name="k")
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual([120] * 3, k)
+    self.assertAllEqual([20] * 3, j)
+
+  def testAssertNotAllClose(self):
+    # Test with arrays
+    self.assertNotAllClose([0.1], [0.2])
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([-1.0, 2.0], [-1.0, 2.0])
+
+    # Test with tensors
+    x = constant_op.constant([1.0, 1.0], name="x")
+    y = math_ops.add(x, x)
+
+    self.assertAllClose([2.0, 2.0], y)
+    self.assertNotAllClose([0.9, 1.0], x)
+
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([1.0, 1.0], x)
+
+  def testAssertNotAllCloseRTol(self):
+    # Test with arrays
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([1.1, 2.1], [1.0, 2.0], rtol=0.2)
+
+    # Test with tensors
+    x = constant_op.constant([1.0, 1.0], name="x")
+    y = math_ops.add(x, x)
+
+    self.assertAllClose([2.0, 2.0], y)
+
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([0.9, 1.0], x, rtol=0.2)
+
+  def testAssertNotAllCloseATol(self):
+    # Test with arrays
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([1.1, 2.1], [1.0, 2.0], atol=0.2)
+
+    # Test with tensors
+    x = constant_op.constant([1.0, 1.0], name="x")
+    y = math_ops.add(x, x)
+
+    self.assertAllClose([2.0, 2.0], y)
+
+    with self.assertRaises(AssertionError):
+      self.assertNotAllClose([0.9, 1.0], x, atol=0.2)
+
+  def testAssertAllGreaterLess(self):
+    x = constant_op.constant([100.0, 110.0, 120.0], dtype=dtypes.float32)
+    y = constant_op.constant([10.0] * 3, dtype=dtypes.float32)
+    z = math_ops.add(x, y)
+
+    self.assertAllClose([110.0, 120.0, 130.0], z)
+
+    self.assertAllGreater(x, 95.0)
+    self.assertAllLess(x, 125.0)
+
+    with self.assertRaises(AssertionError):
+      self.assertAllGreater(x, 105.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllGreater(x, 125.0)
+
+    with self.assertRaises(AssertionError):
+      self.assertAllLess(x, 115.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllLess(x, 95.0)
+
+  def testAssertAllGreaterLessEqual(self):
+    x = constant_op.constant([100.0, 110.0, 120.0], dtype=dtypes.float32)
+    y = constant_op.constant([10.0] * 3, dtype=dtypes.float32)
+    z = math_ops.add(x, y)
+
+    self.assertAllEqual([110.0, 120.0, 130.0], z)
+
+    self.assertAllGreaterEqual(x, 95.0)
+    self.assertAllLessEqual(x, 125.0)
+
+    with self.assertRaises(AssertionError):
+      self.assertAllGreaterEqual(x, 105.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllGreaterEqual(x, 125.0)
+
+    with self.assertRaises(AssertionError):
+      self.assertAllLessEqual(x, 115.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllLessEqual(x, 95.0)
+
+  def testAssertAllInRangeWithNonNumericValuesFails(self):
+    s1 = constant_op.constant("Hello, ", name="s1")
+    c = constant_op.constant([1 + 2j, -3 + 5j], name="c")
+    b = constant_op.constant([False, True], name="b")
+
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(s1, 0.0, 1.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(c, 0.0, 1.0)
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(b, 0, 1)
+
+  def testAssertAllInRange(self):
+    x = constant_op.constant([10.0, 15.0], name="x")
+    self.assertAllInRange(x, 10, 15)
+
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(x, 10, 15, open_lower_bound=True)
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(x, 10, 15, open_upper_bound=True)
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(
+          x, 10, 15, open_lower_bound=True, open_upper_bound=True)
+
+  def testAssertAllInRangeErrorMessageEllipses(self):
+    x_init = np.array([[10.0, 15.0]] * 12)
+    x = constant_op.constant(x_init, name="x")
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(x, 5, 10)
+
+  def testAssertAllInRangeDetectsNaNs(self):
+    x = constant_op.constant(
+        [[np.nan, 0.0], [np.nan, np.inf], [np.inf, np.nan]], name="x")
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(x, 0.0, 2.0)
+
+  def testAssertAllInRangeWithInfinities(self):
+    x = constant_op.constant([10.0, np.inf], name="x")
+    self.assertAllInRange(x, 10, np.inf)
+    with self.assertRaises(AssertionError):
+      self.assertAllInRange(x, 10, np.inf, open_upper_bound=True)
+
+  def testAssertAllInSet(self):
+    b = constant_op.constant([True, False], name="b")
+    x = constant_op.constant([13, 37], name="x")
+
+    self.assertAllInSet(b, [False, True])
+    self.assertAllInSet(b, (False, True))
+    self.assertAllInSet(b, {False, True})
+    self.assertAllInSet(x, [0, 13, 37, 42])
+    self.assertAllInSet(x, (0, 13, 37, 42))
+    self.assertAllInSet(x, {0, 13, 37, 42})
+
+    with self.assertRaises(AssertionError):
+      self.assertAllInSet(b, [False])
+    with self.assertRaises(AssertionError):
+      self.assertAllInSet(x, (42,))
+
   def testRandomSeed(self):
     # Call setUp again for WithCApi case (since it makes a new defeault graph
     # after setup).
-- 
GitLab


From 2ca2390277c2a4ea2d92fb72782bf30bfe00f592 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 24 Apr 2018 16:34:01 -0700
Subject: [PATCH 389/450] Fixing the mock import error for devel docker.

---
 tensorflow/tools/docker/Dockerfile.devel     | 1 +
 tensorflow/tools/docker/Dockerfile.devel-gpu | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 390d7442c3..5c49ac1d8d 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -31,6 +31,7 @@ RUN pip --no-cache-dir install \
         ipykernel \
         jupyter \
         matplotlib \
+        mock \
         numpy \
         scipy \
         sklearn \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 293028d229..196227861b 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -40,6 +40,7 @@ RUN pip --no-cache-dir install \
         ipykernel \
         jupyter \
         matplotlib \
+        mock \
         numpy \
         scipy \
         sklearn \
-- 
GitLab


From 2495ec22832c846b149c394aece2db19f2813b45 Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Tue, 24 Apr 2018 16:52:29 -0700
Subject: [PATCH 390/450] Disable
 UseTowerEstimatorWithoutReplication.test_train_single_tower.

PiperOrigin-RevId: 194168031
---
 .../estimator/replicate_model_fn_test.py      | 53 -------------------
 1 file changed, 53 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
index 144b45982c..dd8a3a95f1 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
@@ -540,59 +540,6 @@ class ReplicateAcrossASingleDeviceWithoutTowerOptimizer(
         self.assertEqual(7.0, session.run(c))
 
 
-class UseTowerEstimatorWithoutReplication(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    features = features['features']
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    optimizer = replicate_model_fn.TowerOptimizer(
-        gradient_descent.GradientDescentOptimizer(params['learning_rate']))
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=optimizer.minimize(loss))
-
-  @property
-  def params(self):
-    params = {}
-    params['learning_rate'] = 1.0
-    return params
-
-  def test_train_single_tower(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'features': features}, y=labels, batch_size=2, shuffle=False)
-
-    with self.test_session():
-      estimator = estimator_lib.Estimator(
-          model_fn=self.model_fn,
-          model_dir=tempfile.mkdtemp(),
-          params=self.params)
-      estimator.train(train_input_fn, steps=1)
-
-      self.assertEqual(7.0, estimator.get_variable_value('c'))
-
-
 class MakeSureSyncReplicasOptimizerWorks(test_util.TensorFlowTestCase):
 
   def model_fn(self, mode, features, labels, params):
-- 
GitLab


From e871ea871fc39521dfa3c9f659b1d576c835c1e9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 17:02:46 -0700
Subject: [PATCH 391/450] Fixed typo in an error message.

PiperOrigin-RevId: 194169339
---
 tensorflow/core/kernels/string_split_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc
index 9efbd66ef7..4c2b312c34 100644
--- a/tensorflow/core/kernels/string_split_op.cc
+++ b/tensorflow/core/kernels/string_split_op.cc
@@ -71,7 +71,7 @@ class StringSplitOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("delimiter", &delimiter_tensor));
     OP_REQUIRES(
         ctx, TensorShapeUtils::IsScalar(delimiter_tensor->shape()),
-        errors::InvalidArgument("delimiter must scalar, got shape: ",
+        errors::InvalidArgument("delimiter must be a scalar, got shape: ",
                                 delimiter_tensor->shape().DebugString()));
     const auto delimiter_vec = delimiter_tensor->flat<string>();
     const string& delimiter = delimiter_vec(0);
-- 
GitLab


From 8b3c5e62be825d78bc25b3c4b6c65a44d47416e0 Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Tue, 24 Apr 2018 17:35:08 -0700
Subject: [PATCH 392/450] `PartitionedCallOp`: An op for executing multi-device
 functions.

A `PartitionedCallOp` allows for execution of functions across multiple devices
but within a single process. It proceeds by placing and partitioning the graph
underlying a given function body, instantiating for each partitioned subgraph a
function. The yielded function shards, which together are equivalent to the
original function, are then executed.

`PartitionedCallOp` is not part of the public TensorFlow API.

PiperOrigin-RevId: 194173114
---
 tensorflow/compiler/jit/BUILD                 |  37 ---
 .../jit/encapsulate_subgraphs_pass.cc         |   2 +-
 .../jit/encapsulate_subgraphs_pass_test.cc    |   2 +-
 tensorflow/compiler/tf2xla/BUILD              |   1 -
 .../tf2xla/functionalize_control_flow.cc      |   2 +-
 tensorflow/core/BUILD                         |   5 +
 .../base_api/api_def_PartitionedCall.pbtxt    |  23 ++
 .../python_api/api_def_PartitionedCall.pbtxt  |   1 +
 .../framework}/graph_to_functiondef.cc        |   4 +-
 .../framework}/graph_to_functiondef.h         |   9 +-
 .../framework}/graph_to_functiondef_test.cc   |   2 +-
 tensorflow/core/kernels/BUILD                 |  12 +
 .../core/kernels/partitioned_function_ops.cc  | 279 ++++++++++++++++++
 tensorflow/core/ops/functional_ops.cc         |   9 +
 tensorflow/python/kernel_tests/BUILD          |   1 +
 .../kernel_tests/functional_ops_test.py       | 106 +++++++
 tensorflow/python/ops/functional_ops.py       |   5 +-
 17 files changed, 450 insertions(+), 50 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_PartitionedCall.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_PartitionedCall.pbtxt
 rename tensorflow/{compiler/jit => core/framework}/graph_to_functiondef.cc (98%)
 rename tensorflow/{compiler/jit => core/framework}/graph_to_functiondef.h (79%)
 rename tensorflow/{compiler/jit => core/framework}/graph_to_functiondef_test.cc (98%)
 create mode 100644 tensorflow/core/kernels/partitioned_function_ops.cc

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 53b124cf89..af2965bba5 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -257,19 +257,6 @@ cc_library(
     alwayslink = 1,
 )
 
-cc_library(
-    name = "graph_to_functiondef",
-    srcs = ["graph_to_functiondef.cc"],
-    hdrs = ["graph_to_functiondef.h"],
-    visibility = [":friends"],
-    deps = [
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 cc_library(
     name = "create_xla_launch_op",
     srcs = [
@@ -300,7 +287,6 @@ cc_library(
     ],
     deps = [
         ":common",
-        ":graph_to_functiondef",
         ":shape_inference_helpers",
         ":union_find",
         "//tensorflow/compiler/jit/graphcycles",
@@ -347,28 +333,6 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
-    name = "graph_to_functiondef_test",
-    size = "small",
-    srcs = [
-        "graph_to_functiondef_test.cc",
-    ],
-    deps = [
-        ":graph_to_functiondef",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:cc_ops_internal",
-        "//tensorflow/cc:function_ops",
-        "//tensorflow/cc:ops",
-        "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
 tf_cc_test(
     name = "compilation_passes_test",
     size = "small",
@@ -379,7 +343,6 @@ tf_cc_test(
     deps = [
         ":common",
         ":compilation_passes",
-        ":graph_to_functiondef",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/cc:function_ops",
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 7507e193b5..f06debaf31 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/compiler/jit/graph_to_functiondef.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
@@ -35,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_def_util.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 3502d1bb45..5ec24d39a2 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -20,8 +20,8 @@ limitations under the License.
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/compiler/jit/graph_to_functiondef.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index ba5c3a1484..942504e6bd 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -412,7 +412,6 @@ cc_library(
     hdrs = ["functionalize_control_flow.h"],
     deps = [
         ":tf2xla_util",
-        "//tensorflow/compiler/jit:graph_to_functiondef",
         "//tensorflow/compiler/jit:union_find",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 23629d85ae..8d1f268490 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -21,13 +21,13 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/compiler/jit/graph_to_functiondef.h"
 #include "tensorflow/compiler/jit/union_find.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index bda87c6aed..e8f10f148d 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -545,6 +545,7 @@ tf_cuda_library(
         "framework/device_base.h",
         "framework/function.h",
         "framework/graph_def_util.h",
+        "framework/graph_to_functiondef.h",
         "framework/kernel_def_builder.h",
         "framework/log_memory.h",
         "framework/lookup_interface.h",
@@ -999,6 +1000,7 @@ cc_library(
         "//tensorflow/core/kernels:nn",
         "//tensorflow/core/kernels:parameterized_truncated_normal_op",
         "//tensorflow/core/kernels:parsing",
+        "//tensorflow/core/kernels:partitioned_function_ops",
         "//tensorflow/core/kernels:random_ops",
         "//tensorflow/core/kernels:random_poisson_op",
         "//tensorflow/core/kernels:remote_fused_graph_ops",
@@ -3061,6 +3063,7 @@ tf_cc_tests(
         "framework/common_shape_fns_test.cc",
         "framework/function_test.cc",
         "framework/graph_def_util_test.cc",
+        "framework/graph_to_functiondef_test.cc",
         "framework/kernel_def_builder_test.cc",
         "framework/memory_types_test.cc",
         "framework/node_def_builder_test.cc",
@@ -3139,6 +3142,8 @@ tf_cc_tests(
         ":testlib",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
         "//tensorflow/cc:sendrecv_ops",
         "//tensorflow/cc:while_loop",
diff --git a/tensorflow/core/api_def/base_api/api_def_PartitionedCall.pbtxt b/tensorflow/core/api_def/base_api/api_def_PartitionedCall.pbtxt
new file mode 100644
index 0000000000..caf8172a52
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PartitionedCall.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "PartitionedCall"
+  in_arg {
+    name: "args"
+    description: "A list of input tensors."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of return values."
+  }
+  attr { name: "Tin"  description: "A list of input types." }
+  attr { name: "Tout"  description: "A list of output types." }
+  attr {
+    name: "f"
+    description: <<END
+      A function that takes 'args', a list of tensors, and returns 'output',
+      another list of tensors. Input and output types are specified by 'Tin'
+      and 'Tout'. The function body of f will be placed and partitioned across
+      devices, setting this op apart from the regular Call op.
+END
+  }
+  summary: "returns `f(inputs)`, where `f`'s body is placed and partitioned."
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PartitionedCall.pbtxt b/tensorflow/core/api_def/python_api/api_def_PartitionedCall.pbtxt
new file mode 100644
index 0000000000..c443acd2e9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PartitionedCall.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "PartitionedCall" visibility: HIDDEN }
diff --git a/tensorflow/compiler/jit/graph_to_functiondef.cc b/tensorflow/core/framework/graph_to_functiondef.cc
similarity index 98%
rename from tensorflow/compiler/jit/graph_to_functiondef.cc
rename to tensorflow/core/framework/graph_to_functiondef.cc
index 8f5e11dfa4..4ffa503379 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef.cc
+++ b/tensorflow/core/framework/graph_to_functiondef.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/jit/graph_to_functiondef.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 
 #include <unordered_map>
 #include <unordered_set>
@@ -111,7 +111,7 @@ string NodeNameMapping::Renormalize(const string& name) const {
 }  // anonymous namespace
 
 // Graph to FunctionDef conversion. This code is closely modeled on the Python
-// code in third_party/tensorflow/python/framework/function.py.
+// code in tensorflow/python/framework/function.py.
 
 Status GraphToFunctionDef(const Graph& graph, const string& name,
                           FunctionDef* fdef) {
diff --git a/tensorflow/compiler/jit/graph_to_functiondef.h b/tensorflow/core/framework/graph_to_functiondef.h
similarity index 79%
rename from tensorflow/compiler/jit/graph_to_functiondef.h
rename to tensorflow/core/framework/graph_to_functiondef.h
index 3e1ae7bbbe..cb0e2b2fbd 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef.h
+++ b/tensorflow/core/framework/graph_to_functiondef.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_JIT_GRAPH_TO_FUNCTIONDEF_H_
-#define TENSORFLOW_COMPILER_JIT_GRAPH_TO_FUNCTIONDEF_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_
+#define TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_
 
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/graph/graph.h"
@@ -23,11 +23,10 @@ limitations under the License.
 namespace tensorflow {
 
 // Converts 'graph' to a FunctionDef 'fdef', with name 'name'.
-// Closely modeled on the Python code in
-// third_party/tensorflow/python/framework/function.py
+// Closely modeled on the Python code in tensorflow/python/framework/function.py
 Status GraphToFunctionDef(const Graph& graph, const string& name,
                           FunctionDef* fdef);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMPILER_JIT_GRAPH_TO_FUNCTIONDEF_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_
diff --git a/tensorflow/compiler/jit/graph_to_functiondef_test.cc b/tensorflow/core/framework/graph_to_functiondef_test.cc
similarity index 98%
rename from tensorflow/compiler/jit/graph_to_functiondef_test.cc
rename to tensorflow/core/framework/graph_to_functiondef_test.cc
index 676db7c4dd..587e2c07ac 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef_test.cc
+++ b/tensorflow/core/framework/graph_to_functiondef_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/jit/graph_to_functiondef.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 201cd35798..f715cddfa6 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2038,6 +2038,17 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "partitioned_function_ops",
+    prefix = "partitioned_function_ops",
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "image",
     deps = [
@@ -5153,6 +5164,7 @@ filegroup(
             "decode_proto_op.cc",
             "encode_proto_op.cc",
             "rpc_op.cc",
+            "partitioned_function_ops.cc",
             # Excluded due to experimental status:
             "debug_ops.*",
             "scatter_nd_op*",
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
new file mode 100644
index 0000000000..d66b1ba663
--- /dev/null
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -0,0 +1,279 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/placer.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_partition.h"
+#include "tensorflow/core/util/reffed_status_callback.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/stream_executor/stream.h"
+#endif  // GOOGLE_CUDA
+
+namespace tensorflow {
+typedef FunctionLibraryRuntime::Handle FHandle;
+
+namespace {
+
+// A `PartitionedCallOp` asynchronously executes a function, potentially across
+// multiple devices but within a single process. The kernel places and
+// partitions a given function's underlying graph, and executes each of the
+// partitioned subgraphs as a function.
+//
+// TODO(akshayka): Support distributed execution.
+class PartitionedCallOp : public AsyncOpKernel {
+ public:
+  explicit PartitionedCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+  }
+
+  ~PartitionedCallOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    FunctionLibraryRuntime* lib = ctx->function_library();
+    OP_REQUIRES_ASYNC(ctx, lib != nullptr,
+                      errors::Internal("No function library is provided."),
+                      done);
+
+    // The function body's graph is placed and partitioned the first time
+    // `ComputeAsync` is invoked; every subsequent invocation calls each
+    // of the function shards yielded by partitioning.
+    //
+    // The partitioning step yields a set of devices on which to run the
+    // function, and exactly one function shard is created for each device.
+    // Inputs and outputs are pinned to the local device, for simplicity.
+    //
+    // TODO(akshayka): Support re-sharding the function on subsequent calls,
+    // via, e.g., virtual device annotations and a list of device names supplied
+    // through an attribute.
+    //
+    // TODO(akshayka): Lift the constraint pinning inputs and outputs to the
+    // local device.
+    //
+    // TODO(akshayka): Add a fastpath for functions that execute on a single
+    // device.
+    {
+      mutex_lock l(mu_);
+      if (!partitioned_) {
+        // Instantiate the function to obtain its underlying graph, complete
+        // with nodes for arguments and return values.
+        FunctionLibraryRuntime::InstantiateOptions opts;
+        FHandle handle;
+        OP_REQUIRES_OK_ASYNC(
+            ctx,
+            lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), opts,
+                             &handle),
+            done);
+        Graph* graph = lib->GetFunctionBody(handle)->graph;
+
+        // Pin the inputs and outputs to the local device to simplify the
+        // function-dispatching logic.
+        local_device_name_ = lib->device()->name();
+        for (Node* node : graph->op_nodes()) {
+          string node_type = node->type_string();
+          if (node_type == FunctionLibraryDefinition::kArgOp ||
+              node_type == FunctionLibraryDefinition::kRetOp) {
+            node->set_assigned_device_name(local_device_name_);
+          }
+        }
+
+        // Place the graph, i.e., assign a device to every node in it.
+        DeviceSet device_set;
+        for (auto d : lib->device_mgr()->ListDevices()) {
+          device_set.AddDevice(d);
+        }
+        Placer placer(graph, &device_set);
+        OP_REQUIRES_OK_ASYNC(ctx, placer.Run(), done);
+
+        // Partition the graph into subgraphs: exactly one subgraph per device.
+        //
+        // TODO(akshayka): Let devices rewrite their graphs.
+        PartitionOptions partition_options;
+        partition_options.node_to_loc = [](const Node* node) {
+          // TODO(akshayka): To better support the distributed case, first split
+          // the graph by worker (e.g., using the master session's
+          // `SplitByWorker` policy), and then recursively partition the
+          // per-worker shards at the remote worker(s).
+          return node->assigned_device_name();
+        };
+        int64 edge_name_counter = 0;
+        partition_options.new_name =
+            [&edge_name_counter](const string& prefix) {
+              return strings::StrCat(prefix, "/_", ++edge_name_counter);
+            };
+        partition_options.get_incarnation =
+            [&device_set](const string& name) -> int64 {
+          const Device* d = device_set.FindDeviceByName(name);
+          if (d == nullptr) {
+            return PartitionOptions::kIllegalIncarnation;
+          } else {
+            return d->attributes().incarnation();
+          }
+        };
+        partition_options.control_flow_added = false;
+        std::unordered_map<string, GraphDef> partitions;
+        OP_REQUIRES_OK_ASYNC(
+            ctx, Partition(partition_options, graph, &partitions), done);
+
+        VLOG(3) << "Partitioned function '" << func_.name() << "', yielding "
+                << partitions.size() << " shards.";
+
+        // `subgraphs` is a map from devices to their corresponding subgraphs.
+        gtl::FlatMap<string, std::unique_ptr<Graph>> subgraphs;
+        const FunctionLibraryDefinition* flib_def = &graph->flib_def();
+        for (const auto& partition : partitions) {
+          std::unique_ptr<Graph> subgraph(new Graph(flib_def));
+          GraphConstructorOptions opts;
+          opts.allow_internal_ops = true;
+          opts.expect_device_spec = true;
+          const string& device = partition.first;
+          const GraphDef& graph_def = partition.second;
+          OP_REQUIRES_OK_ASYNC(
+              ctx, ConvertGraphDefToGraph(opts, graph_def, subgraph.get()),
+              done);
+          subgraphs.emplace(device, std::move(subgraph));
+        }
+
+        // The FunctionLibraryRuntime's library cannot be mutated from within
+        // an OpKernel, so the functions are instantiated in an overlay library.
+        overlay_lib_.reset(new FunctionLibraryDefinition(
+            *lib->GetFunctionLibraryDefinition()));
+        for (const auto& pair : subgraphs) {
+          const string& target = pair.first;
+          Graph* subgraph = pair.second.get();
+          FunctionDef shard;
+          string unique_name = UniquifyFunctionName(func_.name());
+          OP_REQUIRES_OK_ASYNC(
+              ctx, GraphToFunctionDef(*subgraph, unique_name, &shard), done);
+          OP_REQUIRES_OK_ASYNC(ctx, overlay_lib_->AddFunctionDef(shard), done);
+          FunctionLibraryRuntime::InstantiateOptions opts;
+          opts.target = target;
+          opts.overlay_lib = overlay_lib_.get();
+          FHandle handle;
+          OP_REQUIRES_OK_ASYNC(
+              ctx,
+              lib->Instantiate(unique_name, AttrSlice(&shard.attr()), opts,
+                               &handle),
+              done);
+          device_handle_map_.emplace(target, handle);
+        }
+        partitioned_ = true;
+      }
+    }
+
+    FunctionLibraryRuntime::Options opts;
+    opts.step_id = ctx->step_id();
+    opts.step_container = ctx->step_container();
+    opts.cancellation_manager = ctx->cancellation_manager();
+    opts.stats_collector = ctx->stats_collector();
+    // TODO(akshayka): Consider selecting a runner on a per-device basis, i.e.,
+    // using device-specific threadpools when available.
+    opts.runner = ctx->runner();
+    opts.source_device = local_device_name_;
+    // Fetch the inputs first so that a failure here cannot leak `rendez`.
+    OpInputList arguments;
+    OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &arguments), done);
+    // TODO(akshayka): Accommodate the multiple-worker scenario by adding the
+    // constructed rendezvous to a rendezvous manager.
+    Rendezvous* rendez = new IntraProcessRendezvous(lib->device_mgr());
+    opts.rendezvous = rendez;
+    // Dummy args vector for the remote shards, which do not have inputs.
+    std::vector<Tensor> dummy_args;
+
+    StatusCallback callback = std::bind(
+        [](Rendezvous* rendez, DoneCallback& done, const Status& status) {
+          rendez->Unref();
+          done();
+        },
+        rendez, std::move(done), std::placeholders::_1);
+    auto* refcounted_done = new ReffedStatusCallback(std::move(callback));
+    for (int i = 1; i < device_handle_map_.size(); ++i) {
+      refcounted_done->Ref();
+    }
+
+    for (const auto& pair : device_handle_map_) {
+      const string& target_device = pair.first;
+      FHandle handle = pair.second;
+      VLOG(3) << "Running function shard on device " << target_device;
+      if (target_device == local_device_name_) {
+        opts.remote_execution = false;
+        std::vector<Tensor> args;
+        args.reserve(arguments.size());
+        for (const Tensor& argument : arguments) {
+          args.push_back(argument);
+        }
+        auto* rets = new std::vector<Tensor>;
+        lib->Run(opts, handle, args, rets,
+                 [rets, refcounted_done, ctx](const Status& status) {
+                   if (!status.ok()) {
+                     ctx->SetStatus(status);
+                   } else {
+                     for (int i = 0; i < rets->size(); ++i) {
+                       ctx->set_output(i, (*rets)[i]);
+                     }
+                   }
+                   delete rets;
+                   refcounted_done->Unref();
+                 });
+      } else {
+        opts.remote_execution = true;
+        std::vector<Tensor>* dummy_rets = new std::vector<Tensor>;
+        lib->Run(opts, handle, dummy_args, dummy_rets,
+                 [dummy_rets, refcounted_done, ctx](const Status& status) {
+                   if (!status.ok()) {
+                     ctx->SetStatus(status);
+                   }
+                   delete dummy_rets;
+                   refcounted_done->Unref();
+                 });
+      }
+    }
+  }
+
+ private:
+  string UniquifyFunctionName(const string& name) {
+    for (;; ++suffix_) {
+      const string candidate = strings::StrCat(name, "_", suffix_);
+      if (overlay_lib_->Find(candidate) == nullptr) {
+        return candidate;
+      }
+    }
+  }
+
+  // `func_` encapsulates the original, unsharded function.
+  NameAttrList func_;
+  string local_device_name_;
+  // Function shards are added to `overlay_lib_`.
+  std::unique_ptr<FunctionLibraryDefinition> overlay_lib_;
+  // A map from device names to handles of function shards.
+  gtl::FlatMap<string, FHandle> device_handle_map_;
+
+  mutex mu_;
+  bool partitioned_ GUARDED_BY(mu_) = false;
+
+  // Used to uniquify function names in `overlay_lib_`.
+  uint32 suffix_ = 0;
+};
+REGISTER_KERNEL_BUILDER(Name("PartitionedCall").Device(DEVICE_CPU),
+                        PartitionedCallOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index 792686cae1..4d4a370478 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -145,4 +145,13 @@ REGISTER_OP("For")
     .Attr("body: func")
     .SetShapeFn(shape_inference::UnknownShape);
 
+// TODO(b/73826847, b/37549631) Mark as stateful.
+REGISTER_OP("PartitionedCall")
+    .Input("args: Tin")
+    .Output("output: Tout")
+    .Attr("Tin: list(type) >= 0")
+    .Attr("Tout: list(type) >= 0")
+    .Attr("f: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index c03c514699..b4ff094cdf 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1566,6 +1566,7 @@ cuda_py_test(
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:iterator_ops",
     ],
     grpc_enabled = True,
     tags = ["no_windows"],
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 5f48be94da..35a274e75f 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -38,6 +39,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat
 
 
 # pylint: disable=invalid-name
@@ -925,6 +927,110 @@ class FunctionalOpsTest(test.TestCase):
       self.assertAllEqual(sess.run(bvals), [17., 16.])
 
 
+class PartitionedCallTest(test.TestCase):
+
+  def testBasicSingleDevice(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(x, y):
+      with ops.device("/cpu:0"):
+        a = x + x
+        b = y + y
+        return a + b
+
+    output, = self.evaluate(
+        functional_ops.partitioned_call(
+            args=[constant_op.constant(1.),
+                  constant_op.constant(2.)], f=Body))
+    self.assertEqual(output, 6.)
+
+  def testBasicMultiDevice(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(x, y):
+      # if x = 1, y = 2, ...
+      with ops.device("/cpu:0"):
+        # a:= 1 + 1 = 2
+        a = x + x
+      with ops.device("/cpu:1"):
+        # b:= 2 + 2 = 4
+        b = a + y
+      with ops.device("/cpu:2"):
+        # c:= 2 + 4 = 6
+        c = a + b
+      # a + b + c = 2 + 4 + 6 = 12
+      return a + b + c
+
+    with self.test_session(config=config):
+      output, = functional_ops.partitioned_call(
+          args=[constant_op.constant(1.),
+                constant_op.constant(2.)], f=Body)
+      self.assertEqual(output.eval(), 12.)
+
+  def testBasicMultiDeviceGPU(self):
+    if not test_util.is_gpu_available():
+      return
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(x, y):
+      with ops.device("/gpu:0"):
+        a = x + x
+        b = y + y
+      with ops.device("/cpu:0"):
+        c = a + b
+        return c
+
+    output, = self.evaluate(
+        functional_ops.partitioned_call(
+            args=[constant_op.constant(1.),
+                  constant_op.constant(2.)], f=Body))
+    self.assertEqual(output, 6.)
+
+  def testBasicNoDeviceAnnotations(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(x, y):
+      a = x + x
+      b = y + y
+      return a + b
+
+    output, = self.evaluate(
+        functional_ops.partitioned_call(
+            args=[constant_op.constant(1.),
+                  constant_op.constant(2.)], f=Body))
+    self.assertEqual(output, 6.)
+
+  def testShardsRunOnRequestedDevices(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+
+    @function.Defun()
+    def Body():
+      # Serialize DT_RESOURCE handles as DT_STRINGs, which encode the device on
+      # which the resource was created, so that we can verify that ops were
+      # actually run on the requested devices.
+      #
+      # TODO(akshayka): Provide a cleaner, more idiomatic API for obtaining the
+      # name of the device on which a resource lives / for determining the
+      # device on which an op ran.
+      with ops.device("/cpu:0"):
+        s1 = iterator_ops.Iterator.from_structure(
+            (dtypes.float32,)).string_handle()
+      with ops.device("/cpu:1"):
+        s2 = iterator_ops.Iterator.from_structure(
+            (dtypes.float32,)).string_handle()
+      with ops.device("/cpu:2"):
+        s3 = iterator_ops.Iterator.from_structure(
+            (dtypes.float32,)).string_handle()
+      return s1, s2, s3
+
+    with self.test_session(config=config):
+      outputs = functional_ops.partitioned_call(args=[], f=Body)
+      self.assertIn(compat.as_bytes("CPU:0"), outputs[0].eval())
+      self.assertIn(compat.as_bytes("CPU:1"), outputs[1].eval())
+      self.assertIn(compat.as_bytes("CPU:2"), outputs[2].eval())
+
+
 if __name__ == "__main__":
   test.main()
 
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 1b3a1e5cbc..765a2ef993 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -927,6 +927,9 @@ def For(start,
     output_attr.list.i.extend(hostmem)
     ret[0].op._set_attr("_output_hostmem", output_attr)  # pylint: disable=protected-access
   return ret
+# pylint: enable=invalid-name,protected-access
 
 
-# pylint: enable=invalid-name,protected-access
+def partitioned_call(args, f):
+  return gen_functional_ops.partitioned_call(
+      args=args, Tout=[o.type for o in f.definition.signature.output_arg], f=f)
-- 
GitLab


From 44c9c3a170e8919d075d3e8a7c5c5ac7ac5dee43 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Tue, 24 Apr 2018 17:45:22 -0700
Subject: [PATCH 393/450] Fix critical bug in metrics computation in Eager with
 Model.

PiperOrigin-RevId: 194174256
---
 .../_impl/keras/engine/training_eager.py      |  2 +-
 .../_impl/keras/engine/training_eager_test.py | 22 +++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
index ad239d6151..34adeb7599 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -96,7 +96,7 @@ def _eager_metrics_fn(model, outputs, targets):
           model.metrics_names.append(metric_name)
 
       with backend.name_scope(metric_name):
-        metric_result = metric_fn(outputs[i], targets[i])
+        metric_result = metric_fn(targets[i], outputs[i])
         metric_names.append(metric_name)
         metric_results.append(backend.mean(metric_result))
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
index deaf1d1306..5adb3ef940 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -625,6 +625,7 @@ class LossWeightingTest(test.TestCase):
       bad_w_np = np.random.random((10, 2, 2))
       model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
 
+
 class CorrectnessTest(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes()
@@ -648,6 +649,27 @@ class CorrectnessTest(test.TestCase):
     self.assertEqual(
         np.around(history.history['loss'][-1], decimals=4), 0.6173)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_metrics_correctness(self):
+    model = keras.Sequential()
+    model.add(keras.layers.Dense(3,
+                                 activation='relu',
+                                 input_dim=4,
+                                 kernel_initializer='ones'))
+    model.add(keras.layers.Dense(1,
+                                 activation='sigmoid',
+                                 kernel_initializer='ones'))
+    model.compile(loss='mae',
+                  metrics=['acc'],
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+    x = np.ones((100, 4))
+    y = np.ones((100, 1))
+    outs = model.evaluate(x, y)
+    self.assertEqual(outs[1], 1.)
+    y = np.zeros((100, 1))
+    outs = model.evaluate(x, y)
+    self.assertEqual(outs[1], 0.)
+
 if __name__ == '__main__':
   ops.enable_eager_execution()
   test.main()
-- 
GitLab


From 29ac285556cba696a15a4b6cd40034f9eb82e1fd Mon Sep 17 00:00:00 2001
From: "David G. Andersen" <dga@google.com>
Date: Tue, 24 Apr 2018 18:10:22 -0700
Subject: [PATCH 394/450] Add a fuzzer for DecodeWav. Fix an OOM error in which
 a wav file could claim to be larger than it really is, resulting in a large
 memory allocation prior to the size mismatch being detected later.

PiperOrigin-RevId: 194176822
---
 tensorflow/core/kernels/fuzzing/BUILD         |  2 ++
 .../core/kernels/fuzzing/decode_wav_fuzz.cc   | 30 +++++++++++++++++++
 tensorflow/core/lib/wav/wav_io.cc             |  6 ++++
 3 files changed, 38 insertions(+)
 create mode 100644 tensorflow/core/kernels/fuzzing/decode_wav_fuzz.cc

diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index aab4b009b5..8bfa40304e 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -37,6 +37,8 @@ tf_ops_fuzz_target_lib("decode_png")
 
 tf_ops_fuzz_target_lib("decode_jpeg")
 
+tf_ops_fuzz_target_lib("decode_wav")
+
 tf_ops_fuzz_target_lib("example_proto_fast_parsing")
 
 tf_ops_fuzz_target_lib("parse_tensor_op")
diff --git a/tensorflow/core/kernels/fuzzing/decode_wav_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_wav_fuzz.cc
new file mode 100644
index 0000000000..33a11d8e13
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/decode_wav_fuzz.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/ops/audio_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzDecodeWav : public FuzzStringInputOp {
+  SINGLE_INPUT_OP_BUILDER(DT_STRING, DecodeWav);
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzDecodeWav);
+
+}  // namespace fuzzing
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
index 3f7dbcee85..36d939e061 100644
--- a/tensorflow/core/lib/wav/wav_io.cc
+++ b/tensorflow/core/lib/wav/wav_io.cc
@@ -285,6 +285,12 @@ Status DecodeLin16WaveAsFloatVector(const string& wav_string,
       was_data_found = true;
       *sample_count = chunk_size / bytes_per_sample;
       const uint32 data_count = *sample_count * *channel_count;
+      int unused_new_offset = 0;
+      // Validate that the data exists before allocating space for it
+      // (prevent easy OOM errors).
+      TF_RETURN_IF_ERROR(IncrementOffset(offset, sizeof(int16) * data_count,
+                                         wav_string.size(),
+                                         &unused_new_offset));
       float_values->resize(data_count);
       for (int i = 0; i < data_count; ++i) {
         int16 single_channel_value = 0;
-- 
GitLab


From fc726b3e55b8e98f94491e8cf9d3e5114511f219 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 18:33:25 -0700
Subject: [PATCH 395/450] Fix header guard.

PiperOrigin-RevId: 194178828
---
 .../lite/kernels/internal/optimized/tensor_utils_impl.h     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
index 4e324a5e10..ff15f3e3b1 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
-#define TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
 
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
@@ -135,4 +135,4 @@ void NeonReductionSumVector(const float* input_vector, float* output_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
-- 
GitLab


From 67af43120598bbc30cab34ee5f8d8f8a0aaa1617 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 18:51:45 -0700
Subject: [PATCH 396/450] Enable concat hoisting optimization by default.

PiperOrigin-RevId: 194180248
---
 tensorflow/core/grappler/op_types.cc           | 18 ++++++++++++------
 .../grappler/optimizers/arithmetic_optimizer.h |  5 +----
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index f595cf6456..8c184e7096 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -506,47 +506,53 @@ bool IsUnaryElementWise(const NodeDef& node) {
           "Atan2",
           "Atanh",
           "Ceil",
+          "CheckNumerics",
           "ComplexAbs",
           "Conj",
           "Cos",
           "Cosh",
+          "DebugGradientIdentity",
+          "DeepCopy",
           "Digamma",
-          "Elu"
+          "Elu",
+          "Enter",
           "Erf",
           "Erfc",
+          "Exit",
           "Exp",
           "Expm1",
           "Floor",
+          "Identity",
           "Inv",
           "Invert",
-          "Isinf",
-          "Isnan",
-          "Isfinite",
           "Lgamma",
           "Log",
           "Log1p",
           "LogicalNot",
           "Neg",
+          "PreventGradient",
+          "Print",
           "Reciprocal",
           "Relu",
           "Relu6",
           "Rint",
           "Round",
-          "Selu",
           "Rsqrt",
+          "Selu",
           "Sigmoid",
           "Sign",
           "Sin",
           "SinH",
+          "Snapshot",
           "Softplus",
           "Softsign",
           "Sqrt",
           "Square",
+          "StopGradient",
-          "Tan"
+          "Tan",
           "Tanh",
       }));
-  return element_wise_ops->count(node.op()) > 0 ||
-         (!IsIdentityN(node) && IsValueAndOrderPreserving(node));
+  return element_wise_ops->find(node.op()) != element_wise_ops->end();
 }
 
 bool HasOpDef(const NodeDef& node) {
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 375f13acc1..c299bd030b 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -65,16 +65,13 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
     bool remove_negation = true;
-    bool hoist_unary_out_of_concat = false;
+    bool hoist_unary_out_of_concat = true;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
     static ArithmeticOptimizerOptions Default(
         RewriterConfig::Toggle opt_level) {
       ArithmeticOptimizerOptions options;
-      if (opt_level == RewriterConfig::AGGRESSIVE) {
-        options.hoist_unary_out_of_concat = true;
-      }
       return options;
     }
   };
-- 
GitLab


From 928913add3a28c0345a47c3655cf4c0e8a424ad4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 19:18:48 -0700
Subject: [PATCH 397/450] Update ops-related pbtxt files.

PiperOrigin-RevId: 194182374
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 25 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 25 +++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 701897f162..f15e020346 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -35624,6 +35624,31 @@ op {
     type: "type"
   }
 }
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
 op {
   name: "Placeholder"
   output_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index eb43c6fdfb..0211b16b51 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -17436,6 +17436,31 @@ op {
     type: "type"
   }
 }
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
 op {
   name: "Placeholder"
   output_arg {
-- 
GitLab


From 71f07d4f26ddf47fc2d6a7c3b4146be91991a0f2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 20:15:24 -0700
Subject: [PATCH 398/450] Automated g4 rollback of changelist 194180248

PiperOrigin-RevId: 194186080
---
 tensorflow/core/grappler/op_types.cc           | 18 ++++++------------
 .../grappler/optimizers/arithmetic_optimizer.h |  5 ++++-
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 8c184e7096..f595cf6456 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -506,53 +506,47 @@ bool IsUnaryElementWise(const NodeDef& node) {
           "Atan2",
           "Atanh",
           "Ceil",
-          "CheckNumerics",
           "ComplexAbs",
           "Conj",
           "Cos",
           "Cosh",
-          "DebugGradientIdentity",
-          "DeepCopy"
           "Digamma",
           "Elu"
-          "Enter",
           "Erf",
           "Erfc",
-          "Exit",
           "Exp",
           "Expm1",
           "Floor",
-          "Identity",
           "Inv",
           "Invert",
+          "Isinf",
+          "Isnan",
+          "Isfinite",
           "Lgamma",
           "Log",
           "Log1p",
           "LogicalNot",
           "Neg",
-          "PreventGradient",
-          "Print",
           "Reciprocal",
           "Relu",
           "Relu6",
           "Rint",
           "Round",
-          "Rsqrt",
           "Selu",
+          "Rsqrt",
           "Sigmoid",
           "Sign",
           "Sin",
           "SinH",
-          "Snapshot",
           "Softplus",
           "Softsign",
           "Sqrt",
           "Square",
-          "StopGradient",
           "Tan"
           "Tanh",
       }));
-  return element_wise_ops->find(node.op()) != element_wise_ops->end();
+  return element_wise_ops->count(node.op()) > 0 ||
+         (!IsIdentityN(node) && IsValueAndOrderPreserving(node));
 }
 
 bool HasOpDef(const NodeDef& node) {
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index c299bd030b..375f13acc1 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -65,13 +65,16 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
     bool remove_negation = true;
-    bool hoist_unary_out_of_concat = true;
+    bool hoist_unary_out_of_concat = false;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
     static ArithmeticOptimizerOptions Default(
         RewriterConfig::Toggle opt_level) {
       ArithmeticOptimizerOptions options;
+      if (opt_level == RewriterConfig::AGGRESSIVE) {
+        options.hoist_unary_out_of_concat = true;
+      }
       return options;
     }
   };
-- 
GitLab


From 8d9f4990f79b75866128c9d36a8c4750c6bfc566 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 00:03:28 -0700
Subject: [PATCH 399/450] Make ProcessArgMaxOperator faster by reserving the
 correct output dims.

PiperOrigin-RevId: 194200078
---
 .../lite/toco/graph_transformations/propagate_fixed_sizes.cc    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 7946492633..be6e0e07dd 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1501,7 +1501,7 @@ void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) {
   const std::vector<int>& input_dims = input_array.shape().dims();
   std::vector<int> output_dims;
 
-  output_dims.reserve(input_dims.size() - 1);
+  output_dims.reserve(input_dims.size());
   for (int i = 0; i < input_dims.size() - 1; ++i) {
     output_dims.push_back(input_dims[i]);
   }
-- 
GitLab


From 6ce5616e3501f28c6a16db2c60641ff2cd9cabdf Mon Sep 17 00:00:00 2001
From: Junpeng Lao <junpeng.lao@unifr.ch>
Date: Wed, 25 Apr 2018 10:57:46 +0200
Subject: [PATCH 400/450] fix pylint error

---
 .../python/kernel_tests/bijectors/ordered_test.py             | 2 +-
 .../contrib/distributions/python/ops/bijectors/ordered.py     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
index 721dba9c3a..a5f5219588 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
@@ -34,7 +34,7 @@ class OrderedBijectorTest(test.TestCase):
   """Tests correctness of the ordered transformation."""
 
   def setUp(self):
-      self._rng = np.random.RandomState(42)
+    self._rng = np.random.RandomState(42)
 
   @test_util.run_in_graph_and_eager_modes()
   def testBijectorVector(self):
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
index a180f1df0c..eb1eebd4ca 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
@@ -113,8 +113,8 @@ class Ordered(bijector.Bijector):
   def _forward_log_det_jacobian(self, x):
     x = self._maybe_assert_valid_x(x)
     return -math_ops.reduce_sum(
-      math_ops.log(x[..., 1:] - x[..., :-1]),
-      axis=-1)
+        math_ops.log(x[..., 1:] - x[..., :-1]),
+        axis=-1)
 
   def _maybe_assert_valid_x(self, x):
     if not self.validate_args:
-- 
GitLab


From ffd3499094b6201169113eb4db6ae7409a9f0e2e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 02:50:31 -0700
Subject: [PATCH 401/450] Try to ease the timeout issue in Windows Bazel Build

PiperOrigin-RevId: 194215622
---
 tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 8b7495b3b8..a2300811bb 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -87,12 +87,16 @@ create_python_test_dir "${PY_TEST_DIR}"
 PIP_NAME=$(ls ${PY_TEST_DIR}/tensorflow-*.whl)
 reinstall_tensorflow_pip ${PIP_NAME}
 
+# NUMBER_OF_PROCESSORS is predefined on Windows
+N_JOBS="${NUMBER_OF_PROCESSORS}"
+
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
 bazel test -c opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
+  --jobs="${N_JOBS}" --test_timeout="300,450,1200,3600" \
   --flaky_test_attempts=3 \
   //${PY_TEST_DIR}/tensorflow/python/... \
   //${PY_TEST_DIR}/tensorflow/contrib/...
-- 
GitLab


From 3cd2d2a66bb296cbc97be9d7cea6c9bdded60a8c Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Wed, 25 Apr 2018 03:07:04 -0700
Subject: [PATCH 402/450] Make CriticalSection work inside a Dataset with eager
 execution enabled.

tf.colocate_with() might be provided with eager tensors when
constructing TensorFlow functions (like the subgraph for map()
inside a tf.data.Dataset).

Prior to this change, the added test would fail with:
"Tensor.op is meaningless when eager execution is enabled."

PiperOrigin-RevId: 194217166
---
 tensorflow/contrib/framework/BUILD            |  2 ++
 .../python/ops/critical_section_test.py       | 21 +++++++++++++++++++
 tensorflow/python/framework/ops.py            |  2 +-
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index f675cc0cf0..249debbdf6 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -178,6 +178,8 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
     ],
 )
 
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_test.py b/tensorflow/contrib/framework/python/ops/critical_section_test.py
index ba660295cb..df7d7e9dae 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_test.py
+++ b/tensorflow/contrib/framework/python/ops/critical_section_test.py
@@ -19,6 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.framework.python.ops import critical_section_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -330,6 +332,25 @@ class CriticalSectionTest(test.TestCase):
     self.evaluate(v.initializer)
     self.assertEqual(10, self.evaluate(out))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testInsideFunction(self):
+    cs = critical_section_ops.CriticalSection()
+    v = resource_variable_ops.ResourceVariable(1)
+    def fn():
+      return v.read_value()
+
+    # map() creates a TensorFlow function.
+    ds = dataset_ops.Dataset.range(1).map(lambda _: cs.execute(fn))
+
+    def get_first():
+      if context.executing_eagerly():
+        return self.evaluate(ds.make_one_shot_iterator().get_next())
+      itr = ds.make_initializable_iterator()
+      self.evaluate([v.initializer, itr.initializer])
+      return self.evaluate(itr.get_next())
+
+    self.assertEqual(1, get_first())
+
   # TODO(ebrevdo): Re-enable once CriticalSection is in core.
   #
   # def testCriticalSectionAndExecuteOpSaverRoundTrip(self):
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 16a8c575c6..dd9acdd9eb 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -4998,7 +4998,7 @@ def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
     default_graph = get_default_graph()
     if isinstance(op, EagerTensor):
       if default_graph.building_function:
-        op = internal_convert_to_tensor(op)
+        return default_graph.device(op.device)
       else:
         raise ValueError("Encountered an Eager-defined Tensor during graph "
                          "construction, but a function was not being built.")
-- 
GitLab


From 7e11a0d44cca144544ee1f038e3b0f331afbf6e5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 03:07:12 -0700
Subject: [PATCH 403/450] Fix reference computation in client_library_test_base
 for multi device case

The constructor taking an execution option to specify multiple devices
didn't create a reference client, resulting in a SEGV, and later it
tried to execute the reference computation on multiple devices on the
interpreter, which isn't supported.

PiperOrigin-RevId: 194217186
---
 tensorflow/compiler/xla/tests/client_library_test_base.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index c09a6d71c9..541de5ae3b 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -61,6 +61,11 @@ ClientLibraryTestBase::ClientLibraryTestBase(
     : client_(GetOrCreateLocalClientOrDie(client_options)),
       execution_options_(CreateDefaultExecutionOptions()) {
   CHECK_EQ(platform, client_options.platform());
+
+  LocalClientOptions ref_options;
+  ref_options.set_platform(GetReferencePlatform());
+  ref_client_ = GetOrCreateLocalClientOrDie(ref_options);
+
   // Disabling constant_folding so that tests (usually written using Constants)
   // will exercise the intended code paths, instead of being constant folded.
   //
@@ -152,6 +157,7 @@ ClientLibraryTestBase::ExecuteAndTransferReference(
     *execution_options.mutable_shape_with_output_layout() =
         *shape_with_output_layout;
   }
+  execution_options.clear_device_handles();
   return ref_client_->ExecuteAndTransfer(computation, arguments,
                                          &execution_options);
 }
-- 
GitLab


From 84c0b932e3fcac6ecd6b4c9459c39c55cb225d2f Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Wed, 25 Apr 2018 05:59:33 -0700
Subject: [PATCH 404/450] Add a try/finally to context managers.

This way the state is always reset if an error occurs after the context manager yields.

PiperOrigin-RevId: 194228925
---
 tensorflow/python/ops/summary_ops_v2.py | 42 +++++++++++++++----------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py
index 12f361c513..b80f84eb7c 100644
--- a/tensorflow/python/ops/summary_ops_v2.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -74,10 +74,12 @@ def record_summaries_every_n_global_steps(n, global_step=None):
     global_step = training_util.get_or_create_global_step()
   collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
   old = collection_ref[:]
-  with ops.device("cpu:0"):
-    collection_ref[:] = [math_ops.equal(global_step % n, 0)]
-  yield
-  collection_ref[:] = old
+  try:
+    with ops.device("cpu:0"):
+      collection_ref[:] = [math_ops.equal(global_step % n, 0)]
+    yield
+  finally:
+    collection_ref[:] = old
 
 
 @tf_contextlib.contextmanager
@@ -85,9 +87,11 @@ def always_record_summaries():
   """Sets the should_record_summaries Tensor to always true."""
   collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
   old = collection_ref[:]
-  collection_ref[:] = [True]
-  yield
-  collection_ref[:] = old
+  try:
+    collection_ref[:] = [True]
+    yield
+  finally:
+    collection_ref[:] = old
 
 
 @tf_contextlib.contextmanager
@@ -95,9 +99,11 @@ def never_record_summaries():
   """Sets the should_record_summaries Tensor to always false."""
   collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
   old = collection_ref[:]
-  collection_ref[:] = [False]
-  yield
-  collection_ref[:] = old
+  try:
+    collection_ref[:] = [False]
+    yield
+  finally:
+    collection_ref[:] = old
 
 
 class SummaryWriter(object):
@@ -127,12 +133,16 @@ class SummaryWriter(object):
       yield self
     else:
       old = context.context().summary_writer_resource
-      context.context().summary_writer_resource = self._resource
-      yield self
-      # Flushes the summary writer in eager mode or in graph functions, but not
-      # in legacy graph mode (you're on your own there).
-      self.flush()
-      context.context().summary_writer_resource = old
+      try:
+        context.context().summary_writer_resource = self._resource
+        yield self
+        # Flushes the summary writer in eager mode or in graph functions, but
+        # not in legacy graph mode (you're on your own there).
+        with ops.device("cpu:0"):
+          gen_summary_ops.flush_summary_writer(self._resource)
+      finally:
+        context.context().summary_writer_resource = old
+
 
   def init(self):
     """Operation to initialize the summary writer resource."""
-- 
GitLab


From 04eddab6c926890133ca6f3a11a3ac5fd4b7a6d4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 07:35:23 -0700
Subject: [PATCH 405/450] BUGFIX: Correct a typo in fisher_factors.py.

PiperOrigin-RevId: 194238853
---
 tensorflow/contrib/kfac/python/ops/fisher_factors.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
index b2da13db89..7988a3b92b 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
@@ -86,7 +86,7 @@ def set_global_constants(init_covariances_at_zero=None,
                          eigenvalue_clipping_threshold=None,
                          max_num_outer_products_per_cov_row=None,
                          sub_sample_outer_products=None,
-                         inputs_to_extract_ptaches_factor=None,
+                         inputs_to_extract_patches_factor=None,
                          sub_sample_inputs=None,
                          tower_strategy=None):
   """Sets various global constants used by the classes in this module."""
@@ -112,8 +112,8 @@ def set_global_constants(init_covariances_at_zero=None,
     _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = max_num_outer_products_per_cov_row
   if sub_sample_outer_products is not None:
     _SUB_SAMPLE_OUTER_PRODUCTS = sub_sample_outer_products
-  if inputs_to_extract_ptaches_factor is not None:
-    _INPUTS_TO_EXTRACT_PATCHES_FACTOR = inputs_to_extract_ptaches_factor
+  if inputs_to_extract_patches_factor is not None:
+    _INPUTS_TO_EXTRACT_PATCHES_FACTOR = inputs_to_extract_patches_factor
   if sub_sample_inputs is not None:
     _SUB_SAMPLE_INPUTS = sub_sample_inputs
   if tower_strategy is not None:
-- 
GitLab


From dc03c644d40e6d0875c49d95d8e6f1e2db8114de Mon Sep 17 00:00:00 2001
From: James Keeling <jtkeeling@google.com>
Date: Wed, 25 Apr 2018 09:19:57 -0700
Subject: [PATCH 406/450] Update use of squeeze_dims to axis in call to squeeze

This was causing a warning every time a user created a MonitoredSession.

PiperOrigin-RevId: 194248478
---
 tensorflow/python/ops/array_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index aba8beb3f4..1ea1a48c39 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1232,7 +1232,7 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
 
   def _apply_mask_1d(reshaped_tensor, mask, axis=None):
     """Mask tensor along dimension 0 with a 1-D mask."""
-    indices = squeeze(where(mask), squeeze_dims=[1])
+    indices = squeeze(where(mask), axis=[1])
     return gather(reshaped_tensor, indices, axis=axis)
 
   with ops.name_scope(name, values=[tensor, mask]):
-- 
GitLab


From d0e81bce1f8a882691083758d2afd1662ad37463 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Wed, 25 Apr 2018 09:25:25 -0700
Subject: [PATCH 407/450] [XLA:GPU] Set default kernel unrolling factor to 4

NVidia GPUs have 4-wide vector stores, so this is the natural unrolling factor
there. After recent fixes LLVM properly vectorizes the stores, giving a decent
performance win across the board and a huge performance win for F16 kernels.

At the moment the unrolling has no cost model and only applies to loop fusions.
All of the loop fusions I looked at were memory bound, so unrolling them is a
clear win. When we add a cost model and/or auto-tuning later, it has to be at
least as good as unrolling all the loop fusions.

PiperOrigin-RevId: 194249951
---
 tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index 70ae95bf47..bc8405703b 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -43,7 +43,7 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
 #ifdef INTEL_MKL
   flags->set_xla_cpu_use_mkl_dnn(true);
 #endif  // INTEL_MKL
-  flags->set_xla_gpu_max_kernel_unroll_factor(1);
+  flags->set_xla_gpu_max_kernel_unroll_factor(4);
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
   flags->set_xla_gpu_use_cudnn_batchnorm(false);
-- 
GitLab


From 09a39ad1f5d146ea5107cc64dfa0111c1c09424b Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Wed, 25 Apr 2018 09:36:24 -0700
Subject: [PATCH 408/450] Stop using gpu:: as an alias for stream_executor::.

Also do a few related namespace cleanups.

PiperOrigin-RevId: 194252437
---
 .../gpu/gpu_cudamalloc_allocator.cc           |  2 +-
 .../common_runtime/gpu/gpu_debug_allocator.cc | 12 ++--
 .../gpu/gpu_debug_allocator_test.cc           | 22 +++----
 .../core/common_runtime/gpu/gpu_device.cc     | 64 +++++++++----------
 .../core/common_runtime/gpu/gpu_device.h      | 14 ++--
 .../core/common_runtime/gpu/gpu_id_utils.h    | 12 ++--
 tensorflow/core/common_runtime/gpu/gpu_util.h |  4 --
 .../core/common_runtime/gpu/process_state.cc  |  6 +-
 .../core/common_runtime/gpu_device_context.h  | 31 ++++-----
 .../default/from_stream_executor_status.h     |  5 +-
 tensorflow/core/util/stream_executor_util.h   |  7 +-
 .../stream_executor/multi_platform_manager.h  |  2 +-
 12 files changed, 81 insertions(+), 100 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 08961fc105..934a57a5fb 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -38,7 +38,7 @@ GPUcudaMallocAllocator::~GPUcudaMallocAllocator() { delete base_allocator_; }
 void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
 #ifdef GOOGLE_CUDA
   // allocate with cudaMalloc
-  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
+  se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
   CUdeviceptr rv = 0;
   CUresult res = cuMemAlloc(&rv, num_bytes);
   if (res != CUDA_SUCCESS) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 4ff5fab866..e4c834b30d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -41,7 +41,7 @@ int64* before_mask = NewMask(0xabababababababab);
 int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);
 
 bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
-  gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
+  se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
   int64 tmp[MASK_WORDS];
 
   if (!exec->SynchronousMemcpy(&tmp, gpu_ptr, MASK_BYTES)) {
@@ -62,7 +62,7 @@ bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
 }
 
 void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
-  gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
+  se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
   if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) {
     LOG(FATAL) << "Could not copy debug mask";
   }
@@ -174,8 +174,8 @@ void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
   std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                           std::nanf(""));
-  gpu::DeviceMemory<float> nan_ptr{
-      gpu::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
+  se::DeviceMemory<float> nan_ptr{
+      se::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
 
   if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
     LOG(ERROR) << "Could not initialize to NaNs";
@@ -189,8 +189,8 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
     size_t req_size = base_allocator_->RequestedSize(ptr);
     std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                             std::nanf(""));
-    gpu::DeviceMemory<float> nan_ptr{
-        gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
+    se::DeviceMemory<float> nan_ptr{
+        se::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
     if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
       LOG(ERROR) << "Could not initialize to NaNs";
     }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index d34f0cb3c2..236a0afa0b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -43,7 +43,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
     std::vector<int64> cpu_array(s);
     memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
     int64* gpu_array = a.Allocate<int64>(cpu_array.size());
-    gpu::DeviceMemory<int64> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}};
+    se::DeviceMemory<int64> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
     ASSERT_TRUE(stream_exec->SynchronousMemcpy(&gpu_array_ptr, &cpu_array[0],
                                                s * sizeof(int64)));
     EXPECT_TRUE(a.CheckHeader(gpu_array));
@@ -68,13 +68,13 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
           int64* gpu_array = a.Allocate<int64>(cpu_array.size());
 
-          gpu::DeviceMemory<int64> gpu_array_ptr{
-              gpu::DeviceMemoryBase{gpu_array}};
+          se::DeviceMemory<int64> gpu_array_ptr{
+              se::DeviceMemoryBase{gpu_array}};
           ASSERT_TRUE(stream_exec->SynchronousMemcpy(
               &gpu_array_ptr, &cpu_array[0], cpu_array.size() * sizeof(int64)));
 
-          gpu::DeviceMemory<int64> gpu_hdr_ptr{
-              gpu::DeviceMemoryBase{gpu_array - 1}};
+          se::DeviceMemory<int64> gpu_hdr_ptr{
+              se::DeviceMemoryBase{gpu_array - 1}};
           // Clobber first word of the header.
           float pi = 3.1417;
           ASSERT_TRUE(
@@ -101,14 +101,14 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
           int64* gpu_array = a.Allocate<int64>(cpu_array.size());
 
-          gpu::DeviceMemory<int64> gpu_array_ptr{
-              gpu::DeviceMemoryBase{gpu_array}};
+          se::DeviceMemory<int64> gpu_array_ptr{
+              se::DeviceMemoryBase{gpu_array}};
           ASSERT_TRUE(stream_exec->SynchronousMemcpy(
               &gpu_array_ptr, &cpu_array[0], cpu_array.size() * sizeof(int64)));
 
           // Clobber word of the footer.
-          gpu::DeviceMemory<int64> gpu_ftr_ptr{
-              gpu::DeviceMemoryBase{gpu_array + s}};
+          se::DeviceMemory<int64> gpu_ftr_ptr{
+              se::DeviceMemoryBase{gpu_array + s}};
           float pi = 3.1417;
           ASSERT_TRUE(
               stream_exec->SynchronousMemcpy(&gpu_ftr_ptr, &pi, sizeof(float)));
@@ -131,7 +131,7 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
 
   // Allocate 1024 floats
   float* gpu_array = a.Allocate<float>(cpu_array.size());
-  gpu::DeviceMemory<float> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}};
+  se::DeviceMemory<float> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
   ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr,
                                              cpu_array.size() * sizeof(float)));
   for (float f : cpu_array) {
@@ -174,7 +174,7 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
 
   // Allocate 1024 floats
   float* gpu_array = a.Allocate<float>(cpu_array.size());
-  gpu::DeviceMemory<float> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}};
+  se::DeviceMemory<float> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
   ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr,
                                              cpu_array.size() * sizeof(float)));
   for (float f : cpu_array) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index f7248ca79d..4abec7c3d5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -200,27 +200,27 @@ class BaseGPUDevice::StreamGroupFactory {
   // This function is thread safe.
   BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
                                           int stream_group_within_gpu,
-                                          gpu::StreamExecutor* executor) {
+                                          se::StreamExecutor* executor) {
     mutex_lock guard(lock_);
     StreamGroup* group =
         &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
     if (!group->compute) {
-      group->compute = new gpu::Stream(executor);
+      group->compute = new se::Stream(executor);
       group->compute->Init();
       VLOG(2) << "Created stream[" << stream_group_within_gpu
               << "] = " << group->compute;
 
-      group->host_to_device = new gpu::Stream(executor);
+      group->host_to_device = new se::Stream(executor);
       group->host_to_device->Init();
       VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu
               << "] = " << group->host_to_device;
 
-      group->device_to_host = new gpu::Stream(executor);
+      group->device_to_host = new se::Stream(executor);
       group->device_to_host->Init();
       VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu
               << "] = " << group->device_to_host;
 
-      group->device_to_device = new gpu::Stream(executor);
+      group->device_to_device = new se::Stream(executor);
       group->device_to_device->Init();
       VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu
-              << "] = " << group->device_to_host;
+              << "] = " << group->device_to_device;
@@ -440,7 +440,7 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
     gpu_device_context =
         static_cast<GPUDeviceContext*>(context->op_device_context());
   }
-  gpu::Stream* stream = gpu_device_context->stream();
+  se::Stream* stream = gpu_device_context->stream();
   const auto stream_id = gpu_device_context->stream_id();
 
   const bool vlog_1 = VLOG_IS_ON(1);
@@ -484,7 +484,7 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
       if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
     }
   }
-  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
+  se::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
   op_kernel->Compute(context);
   if (context->status().ok()) {
     if (sync_every_op_) {
@@ -503,7 +503,7 @@ void BaseGPUDevice::ConsumeListOfAccessedTensors(
   if (device_context != nullptr) {
     gpu_device_context = static_cast<GPUDeviceContext*>(device_context);
   }
-  gpu::Stream* stream = gpu_device_context->stream();
+  se::Stream* stream = gpu_device_context->stream();
   em_->ThenDeleteTensors(stream, tensor_refs);
 }
 
@@ -519,7 +519,7 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
     gpu_device_context =
         static_cast<GPUDeviceContext*>(context->op_device_context());
   }
-  gpu::Stream* stream = gpu_device_context->stream();
+  se::Stream* stream = gpu_device_context->stream();
   const auto stream_id = gpu_device_context->stream_id();
 
   VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
@@ -531,7 +531,7 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
   // false value. Measurements show that its overhead is negligible.
   port::Tracing::TraceMe activity(op_kernel->name(), op_kernel->type_string(),
                                   op_kernel->IsExpensive());
-  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
+  se::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
   op_kernel->ComputeAsync(context, done);
 }
 
@@ -665,7 +665,7 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
 Status ParseVisibleDeviceList(const string& visible_device_list,
                               std::vector<CudaGpuId>* visible_gpu_order) {
   visible_gpu_order->clear();
-  gpu::Platform* gpu_manager = GPUMachineManager();
+  se::Platform* gpu_manager = GPUMachineManager();
 
   // If the user wants to remap the visible to virtual GPU mapping,
   // check for that here.
@@ -784,7 +784,7 @@ Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
                                       int64* memory_limit) {
   int64 total_memory = 0;
   int64 available_memory = 0;
-  gpu::StreamExecutor* se =
+  se::StreamExecutor* se =
       GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
   if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
     return errors::Unknown("Failed to query available memory for GPU ",
@@ -858,7 +858,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                                            const string& name_prefix,
                                            std::vector<Device*>* devices) {
   TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
-  gpu::Platform* gpu_manager = GPUMachineManager();
+  se::Platform* gpu_manager = GPUMachineManager();
   if (gpu_manager == nullptr) {
     return Status::OK();
   }
@@ -997,7 +997,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
 }
 
 static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id,
-                                        const gpu::DeviceDescription& desc) {
+                                        const se::DeviceDescription& desc) {
   int cc_major;
   int cc_minor;
   if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) {
@@ -1025,9 +1025,9 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
   CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
   int numa_node = dev_locality.numa_node();
 
-  gpu::StreamExecutor* se =
+  se::StreamExecutor* se =
       GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
-  const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+  const se::DeviceDescription& desc = se->GetDeviceDescription();
   ProcessState* process_state = ProcessState::singleton();
   Allocator* gpu_allocator = process_state->GetGPUAllocator(
       options.config.gpu_options(), tf_gpu_id, memory_limit);
@@ -1060,15 +1060,15 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
 
 namespace {
 std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>>
-GetPeerAccessMap(gpu::Platform* platform,
+GetPeerAccessMap(se::Platform* platform,
                  const std::vector<CudaGpuId>& visible_gpu_order) {
   std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> map(
       new std::map<std::pair<CudaGpuId, CudaGpuId>, bool>);
   for (CudaGpuId cuda_gpu_i : visible_gpu_order) {
     for (CudaGpuId cuda_gpu_j : visible_gpu_order) {
-      gpu::StreamExecutor* from =
+      se::StreamExecutor* from =
           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
-      gpu::StreamExecutor* to =
+      se::StreamExecutor* to =
           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
       (*map)[{cuda_gpu_i, cuda_gpu_j}] = from->CanEnablePeerAccessTo(to);
     }
@@ -1080,7 +1080,7 @@ GetPeerAccessMap(gpu::Platform* platform,
 }  // namespace
 
 Status BaseGPUDeviceFactory::GetInterconnectMaps(
-    const std::vector<CudaGpuId>& visible_gpu_order, gpu::Platform* gpu_manager,
+    const std::vector<CudaGpuId>& visible_gpu_order, se::Platform* gpu_manager,
     std::vector<InterconnectMap>* maps) {
   // The default interconnect map is obtained from the StreamExecutor.
   auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
@@ -1111,9 +1111,9 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
     // virtualized in some environments, we can't just use the GPU id.
     // NUMA locales are indexed from 0, buses are indexed from 1.
-    gpu::StreamExecutor* se =
+    se::StreamExecutor* se =
         GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
-    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+    const se::DeviceDescription& desc = se->GetDeviceDescription();
     int numa_node = desc.numa_node();
     if (numa_node < 0) {
       // For some reason the StreamExecutor couldn't get the NUMA
@@ -1169,7 +1169,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
 }
 
 static int GetDefaultMinGPUMultiprocessorCount(
-    gpu::Platform* gpu_manager,
+    se::Platform* gpu_manager,
     const std::vector<CudaGpuId>& visible_gpu_order) {
   static const int kDefaultMinGPUMultiprocessorCount = 8;
 
@@ -1182,8 +1182,8 @@ static int GetDefaultMinGPUMultiprocessorCount(
       continue;
     }
 
-    gpu::StreamExecutor* se = exec_status.ValueOrDie();
-    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+    se::StreamExecutor* se = exec_status.ValueOrDie();
+    const se::DeviceDescription& desc = se->GetDeviceDescription();
     max_count = std::max(max_count, desc.core_count());
   }
 
@@ -1195,7 +1195,7 @@ static int GetDefaultMinGPUMultiprocessorCount(
 }
 
 static int GetMinGPUMultiprocessorCount(
-    gpu::Platform* gpu_manager,
+    se::Platform* gpu_manager,
     const std::vector<CudaGpuId>& visible_gpu_order) {
   const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
 
@@ -1273,7 +1273,7 @@ std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
   return cuda_caps;
 }
 
-Status EnablePeerAccess(gpu::Platform* platform,
+Status EnablePeerAccess(se::Platform* platform,
                         const std::vector<CudaGpuId>& visible_gpu_order) {
   int possible_peer_count = 0;
   int enabled_peer_count = 0;
@@ -1282,9 +1282,9 @@ Status EnablePeerAccess(gpu::Platform* platform,
     for (int j = 0; j < visible_gpu_order.size(); ++j) {
       const CudaGpuId cuda_gpu_j = visible_gpu_order[j];
       // We have already validated that ExecutorForDevice() calls return OK.
-      gpu::StreamExecutor* from =
+      se::StreamExecutor* from =
           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
-      gpu::StreamExecutor* to =
+      se::StreamExecutor* to =
           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
 
       if (from->CanEnablePeerAccessTo(to)) {
@@ -1318,7 +1318,7 @@ Status EnablePeerAccess(gpu::Platform* platform,
 Status BaseGPUDeviceFactory::GetValidDeviceIds(
     const std::vector<CudaGpuId>& visible_gpu_order,
     std::vector<CudaGpuId>* ids) {
-  gpu::Platform* gpu_manager = GPUMachineManager();
+  se::Platform* gpu_manager = GPUMachineManager();
   bool new_gpu_found = false;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
     const CudaGpuId cuda_gpu_id = visible_gpu_order[i];
@@ -1388,8 +1388,8 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
                 << exec_status.status().ToString();
       continue;
     }
-    gpu::StreamExecutor* se = exec_status.ValueOrDie();
-    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+    se::StreamExecutor* se = exec_status.ValueOrDie();
+    const se::DeviceDescription& desc = se->GetDeviceDescription();
     CudaVersion device_capability;
     if (!desc.cuda_compute_capability(&device_capability.major_part,
                                       &device_capability.minor_part)) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index cc5c3881dd..b754ffd2db 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -94,7 +94,7 @@ class BaseGPUDevice : public LocalDevice {
 
   // The executor that provides control for the device; e.g., for CUDA this
   // corresponds to the cuda context.
-  gpu::StreamExecutor* executor() const { return executor_; }
+  se::StreamExecutor* executor() const { return executor_; }
 
   Allocator* GetScopedAllocator(AllocatorAttributes attr,
                                 int64 step_id) override;
@@ -107,15 +107,15 @@ class BaseGPUDevice : public LocalDevice {
   Allocator* gpu_allocator_;  // not owned
   Allocator* cpu_allocator_;  // not owned
 
-  gpu::StreamExecutor* executor_;  // not owned
+  se::StreamExecutor* executor_;  // not owned
   std::unique_ptr<ScopedAllocatorMgr> scoped_allocator_mgr_;
 
  private:
   struct StreamGroup {
-    gpu::Stream* compute = nullptr;
-    gpu::Stream* host_to_device = nullptr;
-    gpu::Stream* device_to_host = nullptr;
-    gpu::Stream* device_to_device = nullptr;
+    se::Stream* compute = nullptr;
+    se::Stream* host_to_device = nullptr;
+    se::Stream* device_to_host = nullptr;
+    se::Stream* device_to_device = nullptr;
   };
   class StreamGroupFactory;
 
@@ -168,7 +168,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   // pathways between GPUs.
   virtual Status GetInterconnectMaps(
       const std::vector<CudaGpuId>& visible_gpu_order,
-      gpu::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
+      se::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
 
   struct TfGpuIdHash {
     std::size_t operator()(const TfGpuId& id) const noexcept {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
index 5c503d1261..42bf074e63 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
@@ -24,24 +24,20 @@ limitations under the License.
 
 namespace tensorflow {
 
-// TODO(b/77980417): Remove this and use the regular tensorflow::se alias once
-// that's available.
-namespace gpu = ::stream_executor;
-
 // Utility methods for translation between Tensorflow GPU ids and CUDA GPU ids.
 class GpuIdUtil {
  public:
   // Convenient methods for getting the associated executor given a TfGpuId or
   // CudaGpuId.
-  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForCudaGpuId(
-      gpu::Platform* gpu_manager, CudaGpuId cuda_gpu_id) {
+  static se::port::StatusOr<se::StreamExecutor*> ExecutorForCudaGpuId(
+      se::Platform* gpu_manager, CudaGpuId cuda_gpu_id) {
     return gpu_manager->ExecutorForDevice(cuda_gpu_id.value());
   }
-  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForCudaGpuId(
+  static se::port::StatusOr<se::StreamExecutor*> ExecutorForCudaGpuId(
       CudaGpuId cuda_gpu_id) {
     return ExecutorForCudaGpuId(GPUMachineManager(), cuda_gpu_id);
   }
-  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForTfGpuId(
+  static se::port::StatusOr<se::StreamExecutor*> ExecutorForTfGpuId(
       TfGpuId tf_gpu_id) {
     return ExecutorForCudaGpuId(GpuIdManager::TfToCudaGpuId(tf_gpu_id));
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
index 0c69a17eaa..237b0044da 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -27,10 +27,6 @@ namespace tensorflow {
 class RecvTensorResponse;
 class TensorProto;
 
-// TODO(b/77980417): Remove this and use the regular tensorflow::se alias once
-// that's available.
-namespace gpu = ::stream_executor;
-
 class GPUUtil {
  public:
   // "tensor" is GPU-local.  "dev" is the hosting GPU.
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 866a03d046..5ed01278c1 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -146,7 +146,7 @@ Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options,
 
     // If there are any pending AllocVisitors for this bus, add
     // them now.
-    gpu::StreamExecutor* se =
+    se::StreamExecutor* se =
         GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
     int bus_id = se->GetDeviceDescription().numa_node();
     if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
@@ -257,7 +257,7 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   // better source of information about which executor to use.  For
   // example, process_state could maybe save the first stream executor
   // it knows is valid.
-  gpu::StreamExecutor* se = nullptr;
+  se::StreamExecutor* se = nullptr;
   for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
     if (gpu_allocators_[i] != nullptr) {
       se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
@@ -305,7 +305,7 @@ void ProcessState::AddGPUAllocVisitor(int bus_id, AllocVisitor visitor) {
 #if GOOGLE_CUDA
   mutex_lock lock(mu_);
   for (int i = 0; i < static_cast<int64>(gpu_allocators_.size()); ++i) {
-    gpu::StreamExecutor* se =
+    se::StreamExecutor* se =
         GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
     if (gpu_allocators_[i] &&
         (se->GetDeviceDescription().numa_node() + 1) == bus_id) {
diff --git a/tensorflow/core/common_runtime/gpu_device_context.h b/tensorflow/core/common_runtime/gpu_device_context.h
index a1ad2c2277..c92c5d1af3 100644
--- a/tensorflow/core/common_runtime/gpu_device_context.h
+++ b/tensorflow/core/common_runtime/gpu_device_context.h
@@ -25,16 +25,13 @@ class Stream;
 
 namespace tensorflow {
 
-// TODO(b/77980417): Replace stream_executor:: with se:: once our namespace
-// migration is complete and the alias is available.
-
 class GPUDeviceContext : public DeviceContext {
  public:
   // Does not take ownership of streams.
-  GPUDeviceContext(int stream_id, stream_executor::Stream* stream,
-                   stream_executor::Stream* host_to_device_stream,
-                   stream_executor::Stream* device_to_host_stream,
-                   stream_executor::Stream* device_to_device_stream)
+  GPUDeviceContext(int stream_id, se::Stream* stream,
+                   se::Stream* host_to_device_stream,
+                   se::Stream* device_to_host_stream,
+                   se::Stream* device_to_device_stream)
       : stream_id_(stream_id),
         stream_(stream),
         host_to_device_stream_(host_to_device_stream),
@@ -43,14 +40,10 @@ class GPUDeviceContext : public DeviceContext {
 
   ~GPUDeviceContext() override {}
 
-  stream_executor::Stream* stream() const override { return stream_; }
-  stream_executor::Stream* host_to_device_stream() const {
-    return host_to_device_stream_;
-  }
-  stream_executor::Stream* device_to_host_stream() const {
-    return device_to_host_stream_;
-  }
-  stream_executor::Stream* device_to_device_stream() const {
+  se::Stream* stream() const override { return stream_; }
+  se::Stream* host_to_device_stream() const { return host_to_device_stream_; }
+  se::Stream* device_to_host_stream() const { return device_to_host_stream_; }
+  se::Stream* device_to_device_stream() const {
     return device_to_device_stream_;
   }
   int stream_id() const { return stream_id_; }
@@ -70,13 +63,13 @@ class GPUDeviceContext : public DeviceContext {
   int stream_id_;
   // The default primary stream to use for this context.
   // All the memory belongs to this stream.
-  stream_executor::Stream* stream_;
+  se::Stream* stream_;
   // The stream to use for copy data from host into GPU.
-  stream_executor::Stream* host_to_device_stream_;
+  se::Stream* host_to_device_stream_;
   // The stream to use for copy data from GPU to host.
-  stream_executor::Stream* device_to_host_stream_;
+  se::Stream* device_to_host_stream_;
   // The stream to use for copy data between GPU.
-  stream_executor::Stream* device_to_device_stream_;
+  se::Stream* device_to_device_stream_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/from_stream_executor_status.h b/tensorflow/core/platform/default/from_stream_executor_status.h
index 36a67a3648..93d2f28ca8 100644
--- a/tensorflow/core/platform/default/from_stream_executor_status.h
+++ b/tensorflow/core/platform/default/from_stream_executor_status.h
@@ -25,10 +25,7 @@ namespace tensorflow {
 
 // On the open-source platform, stream_executor currently uses
 // tensorflow::Status
-inline Status FromStreamExecutorStatus(
-    const perftools::gputools::port::Status& s) {
-  return s;
-}
+inline Status FromStreamExecutorStatus(const se::port::Status& s) { return s; }
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h
index f7767ace71..7d715de499 100644
--- a/tensorflow/core/util/stream_executor_util.h
+++ b/tensorflow/core/util/stream_executor_util.h
@@ -30,17 +30,16 @@ class StreamExecutorUtil {
   // Map a Tensor as a DeviceMemory object wrapping the given typed
   // buffer.
   template <typename T>
-  static perftools::gputools::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
+  static se::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
     T* ptr = reinterpret_cast<T*>(const_cast<char*>(t.tensor_data().data()));
-    return perftools::gputools::DeviceMemory<T>(
-        perftools::gputools::DeviceMemoryBase(ptr, t.TotalBytes()));
+    return se::DeviceMemory<T>(se::DeviceMemoryBase(ptr, t.TotalBytes()));
   }
 
   // Converts from a StreamExecutor Status to a TensorFlow Status.
   //
   // This assumes that the error codes between the two implementations
   // match.
-  static Status ConvertStatus(const perftools::gputools::port::Status& s) {
+  static Status ConvertStatus(const se::port::Status& s) {
     return s.ok() ? Status::OK()
                   : Status(static_cast<tensorflow::error::Code>(
                                static_cast<int>(s.code())),
diff --git a/tensorflow/stream_executor/multi_platform_manager.h b/tensorflow/stream_executor/multi_platform_manager.h
index 672855d5fb..7e316879ca 100644
--- a/tensorflow/stream_executor/multi_platform_manager.h
+++ b/tensorflow/stream_executor/multi_platform_manager.h
@@ -29,7 +29,7 @@ limitations under the License.
 // interface. Sample API usage:
 //
 //   port::StatusOr<Platform*> platform_status =
-//      gpu::MultiPlatformManager::PlatformWithName("OpenCL");
+//      se::MultiPlatformManager::PlatformWithName("OpenCL");
 //   if (!platform_status.ok()) { ... }
 //   Platform* platform = platform_status.ValueOrDie();
 //   LOG(INFO) << platform->VisibleDeviceCount() << " devices visible";
-- 
GitLab


From 63f4618fbdd653fd19a3663a64da89c476aeb0cd Mon Sep 17 00:00:00 2001
From: Junpeng Lao <junpeng.lao@unifr.ch>
Date: Wed, 25 Apr 2018 19:01:38 +0200
Subject: [PATCH 409/450] fix Non-ASCII character error

---
 .../contrib/distributions/python/ops/bijectors/ordered.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
index eb1eebd4ca..3f03592f31 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
@@ -106,7 +106,7 @@ class Ordered(bijector.Bijector):
     #          exp(y_i) if 1<i<=K
     # which gives the absolute Jacobian determinant:
     # |det(Jac)| = prod_{i=1}^{K} exp(y[i]).
-    # (1) - Stan Modeling Language User’s Guide and Reference Manual
+    # (1) - Stan Modeling Language User's Guide and Reference Manual
     #       Version 2.17.0 session 35.2
     return math_ops.reduce_sum(y[..., 1:], axis=-1)
 
@@ -122,4 +122,4 @@ class Ordered(bijector.Bijector):
     is_valid = check_ops.assert_positive(
         x[..., 1:] - x[..., :-1],
         message="Forward transformation input must be strictly increasing.")
-    return control_flow_ops.with_dependencies([is_valid], x)
\ No newline at end of file
+    return control_flow_ops.with_dependencies([is_valid], x)
-- 
GitLab


From ac7636d72cadeac05509450fd50269ed66ee41a6 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Wed, 25 Apr 2018 10:09:07 -0700
Subject: [PATCH 410/450] Test more types in ternary_ops_test.

PiperOrigin-RevId: 194256929
---
 tensorflow/compiler/tests/ternary_ops_test.py | 61 ++++++++++---------
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py
index 75a2cf07c5..ef047005b6 100644
--- a/tensorflow/compiler/tests/ternary_ops_test.py
+++ b/tensorflow/compiler/tests/ternary_ops_test.py
@@ -69,40 +69,41 @@ class TernaryOpsTest(XLATestCase):
         expected=np.array([1, 3, 5], dtype=np.int32))
 
   def testSelect(self):
-    self._testTernary(
-        array_ops.where,
-        np.array(0, dtype=np.bool),
-        np.array(2, dtype=np.float32),
-        np.array(7, dtype=np.float32),
-        expected=np.array(7, dtype=np.float32))
+    for dtype in self.numeric_types:
+      self._testTernary(
+          array_ops.where,
+          np.array(0, dtype=np.bool),
+          np.array(2, dtype=dtype),
+          np.array(7, dtype=dtype),
+          expected=np.array(7, dtype=dtype))
 
-    self._testTernary(
-        array_ops.where,
-        np.array(1, dtype=np.bool),
-        np.array([1, 2, 3, 4], dtype=np.float32),
-        np.array([5, 6, 7, 8], dtype=np.float32),
-        expected=np.array([1, 2, 3, 4], dtype=np.float32))
+      self._testTernary(
+          array_ops.where,
+          np.array(1, dtype=np.bool),
+          np.array([1, 2, 3, 4], dtype=dtype),
+          np.array([5, 6, 7, 8], dtype=dtype),
+          expected=np.array([1, 2, 3, 4], dtype=dtype))
 
-    self._testTernary(
-        array_ops.where,
-        np.array(0, dtype=np.bool),
-        np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32),
-        np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32),
-        expected=np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32))
+      self._testTernary(
+          array_ops.where,
+          np.array(0, dtype=np.bool),
+          np.array([[1, 2], [3, 4], [5, 6]], dtype=dtype),
+          np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype),
+          expected=np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype))
 
-    self._testTernary(
-        array_ops.where,
-        np.array([0, 1, 1, 0], dtype=np.bool),
-        np.array([1, 2, 3, 4], dtype=np.float32),
-        np.array([5, 6, 7, 8], dtype=np.float32),
-        expected=np.array([5, 2, 3, 8], dtype=np.float32))
+      self._testTernary(
+          array_ops.where,
+          np.array([0, 1, 1, 0], dtype=np.bool),
+          np.array([1, 2, 3, 4], dtype=dtype),
+          np.array([5, 6, 7, 8], dtype=dtype),
+          expected=np.array([5, 2, 3, 8], dtype=dtype))
 
-    self._testTernary(
-        array_ops.where,
-        np.array([0, 1, 0], dtype=np.bool),
-        np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32),
-        np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32),
-        expected=np.array([[7, 8], [3, 4], [11, 12]], dtype=np.float32))
+      self._testTernary(
+          array_ops.where,
+          np.array([0, 1, 0], dtype=np.bool),
+          np.array([[1, 2], [3, 4], [5, 6]], dtype=dtype),
+          np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype),
+          expected=np.array([[7, 8], [3, 4], [11, 12]], dtype=dtype))
 
   def testSlice(self):
     for dtype in self.numeric_types:
-- 
GitLab


From 169f88708959de64374ff4c5e7728d9b75d95b4d Mon Sep 17 00:00:00 2001
From: Andrew Selle <aselle@google.com>
Date: Wed, 25 Apr 2018 10:21:14 -0700
Subject: [PATCH 411/450] Improve look of TensorFlow Lite Demo App

- Include logo.
- Use the TensorFlow logo as the launcher icon.
- Show the first (top) item larger than the rest.
- Grey out low-confidence matches.

PiperOrigin-RevId: 194259027
---
 .../Camera2BasicFragment.java                 |  16 +++-
 .../tflitecamerademo/ImageClassifier.java     |  48 +++++++---
 .../main/res/drawable-hdpi/ic_launcher.png    | Bin 3136 -> 3696 bytes
 .../main/res/drawable-mdpi/ic_launcher.png    | Bin 1915 -> 1847 bytes
 .../main/res/drawable-xhdpi/ic_launcher.png   | Bin 4294 -> 5666 bytes
 .../main/res/drawable-xxhdpi/ic_launcher.png  | Bin 7279 -> 10264 bytes
 .../app/src/main/res/drawable-xxhdpi/logo.png | Bin 0 -> 23476 bytes
 .../layout-land/fragment_camera2_basic.xml    |  59 +++++++-----
 .../res/layout-v26/fragment_camera2_basic.xml |  88 ++++++++++++++++++
 .../res/layout/fragment_camera2_basic.xml     |  45 +++++----
 10 files changed, 198 insertions(+), 58 deletions(-)
 create mode 100644 tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png
 create mode 100644 tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml

diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 18f6465188..4f5662bc2d 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -47,6 +47,8 @@ import android.os.HandlerThread;
 import android.support.annotation.NonNull;
 import android.support.v13.app.FragmentCompat;
 import android.support.v4.content.ContextCompat;
+import android.text.SpannableString;
+import android.text.SpannableStringBuilder;
 import android.util.Log;
 import android.util.Size;
 import android.view.LayoutInflater;
@@ -207,14 +209,21 @@ public class Camera2BasicFragment extends Fragment
    *
    * @param text The message to show
    */
-  private void showToast(final String text) {
+  private void showToast(String text) {
+    SpannableStringBuilder builder = new SpannableStringBuilder();
+    SpannableString str1 = new SpannableString(text);
+    builder.append(str1);
+    showToast(builder);
+  }
+
+  private void showToast(final SpannableStringBuilder builder) {
     final Activity activity = getActivity();
     if (activity != null) {
       activity.runOnUiThread(
           new Runnable() {
             @Override
             public void run() {
-              textView.setText(text);
+              textView.setText(builder, TextView.BufferType.SPANNABLE);
             }
           });
     }
@@ -682,8 +691,9 @@ public class Camera2BasicFragment extends Fragment
       showToast("Uninitialized Classifier or invalid context.");
       return;
     }
+    SpannableStringBuilder textToShow = new SpannableStringBuilder();
     Bitmap bitmap = textureView.getBitmap(classifier.getImageSizeX(), classifier.getImageSizeY());
-    String textToShow = classifier.classifyFrame(bitmap);
+    classifier.classifyFrame(bitmap, textToShow);
     bitmap.recycle();
     showToast(textToShow);
   }
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index d32c077910..7bb6afd9d8 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -19,10 +19,11 @@ import android.app.Activity;
 import android.content.res.AssetFileDescriptor;
 import android.graphics.Bitmap;
 import android.os.SystemClock;
+import android.text.SpannableString;
+import android.text.SpannableStringBuilder;
+import android.text.style.ForegroundColorSpan;
+import android.text.style.RelativeSizeSpan;
 import android.util.Log;
-
-import org.tensorflow.lite.Interpreter;
-
 import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -37,11 +38,15 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 import java.util.PriorityQueue;
+import org.tensorflow.lite.Interpreter;
 
 /**
  * Classifies images with Tensorflow Lite.
  */
 public abstract class ImageClassifier {
+  // Display preferences
+  private static final float GOOD_PROB_THRESHOLD = 0.3f;
+  private static final int SMALL_COLOR = 0xffddaa88;
 
   /** Tag for the {@link Log}. */
   private static final String TAG = "TfLiteCameraDemo";
@@ -99,10 +104,12 @@ public abstract class ImageClassifier {
   }
 
   /** Classifies a frame from the preview stream. */
-  String classifyFrame(Bitmap bitmap) {
+  void classifyFrame(Bitmap bitmap, SpannableStringBuilder builder) {
+    printTopKLabels(builder);
+
     if (tflite == null) {
       Log.e(TAG, "Image classifier has not been initialized; Skipped.");
-      return "Uninitialized Classifier.";
+      builder.append(new SpannableString("Uninitialized Classifier."));
     }
     convertBitmapToByteBuffer(bitmap);
     // Here's where the magic happens!!!
@@ -115,9 +122,10 @@ public abstract class ImageClassifier {
     applyFilter();
 
     // Print the results.
-    String textToShow = printTopKLabels();
-    textToShow = Long.toString(endTime - startTime) + "ms" + textToShow;
-    return textToShow;
+    long duration = endTime - startTime;
+    SpannableString span = new SpannableString(duration + " ms");
+    span.setSpan(new ForegroundColorSpan(android.graphics.Color.LTGRAY), 0, span.length(), 0);
+    builder.append(span);
   }
 
   void applyFilter() {
@@ -202,7 +210,7 @@ public abstract class ImageClassifier {
   }
 
   /** Prints top-K labels, to be shown in UI as the results. */
-  private String printTopKLabels() {
+  private void printTopKLabels(SpannableStringBuilder builder) {
     for (int i = 0; i < getNumLabels(); ++i) {
       sortedLabels.add(
           new AbstractMap.SimpleEntry<>(labelList.get(i), getNormalizedProbability(i)));
@@ -210,13 +218,27 @@ public abstract class ImageClassifier {
         sortedLabels.poll();
       }
     }
-    String textToShow = "";
+
     final int size = sortedLabels.size();
-    for (int i = 0; i < size; ++i) {
+    for (int i = 0; i < size; i++) {
       Map.Entry<String, Float> label = sortedLabels.poll();
-      textToShow = String.format("\n%s: %4.2f", label.getKey(), label.getValue()) + textToShow;
+      SpannableString span =
+          new SpannableString(String.format("%s: %4.2f\n", label.getKey(), label.getValue()));
+      int color;
+      // Make it white when probability larger than threshold.
+      if (label.getValue() > GOOD_PROB_THRESHOLD) {
+        color = android.graphics.Color.WHITE;
+      } else {
+        color = SMALL_COLOR;
+      }
+      // Make first item bigger.
+      if (i == size - 1) {
+        float sizeScale = (i == size - 1) ? 1.25f : 0.8f;
+        span.setSpan(new RelativeSizeSpan(sizeScale), 0, span.length(), 0);
+      }
+      span.setSpan(new ForegroundColorSpan(color), 0, span.length(), 0);
+      builder.insert(0, span);
     }
-    return textToShow;
   }
 
   /**
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png
index c22509d8dfccae14d9470e3042a9ed5b469ca2c9..52cf2ab95296d675dd42533bb9136707adebd98c 100644
GIT binary patch
literal 3696
zcmeAS@N?(olHy`uVBq!ia0y~yVDJE84mJh`hS0a0-5D78%spKkLn>~)ox>Rs@>T4>
z*>7*Z-kQI1$HFEF4TFwNTszd(xn$0qIx8*d;iR;sGt@L^8YnSEc`3bXRcg~YtfaNb
zj`j4s*Ov>tugtJ6o<6hs{r^41?_)ToAGAK9ee`Gj{hxoH&Hi}LdjJ2M|DMlT=qdbr
zlRgMd5@)#6|A{5$?)6BUT^VLKW$T)r?21=YRW>^yzm)OEy>1Qug?Gx=H}2$rDmdlY
zA^rH)?xXA1s9C*#tg?Rg#3xl7RIYz!I{95&@O%F}{{6Wh!yRnzyl~V^kaNvc{9~0r
zV@LXjphcf=`WKn*U;J!_K+g&e{pM4jt(?ogU3a|czWL#!viv{w=NH&qm|^-XzCdZC
z{u_DI`68`PPBG|6DNnHBJ$SvrDP^K#<%boKY(Wp|S-$ba@jeYc?wDHW{xs^lsh9yj
zQ;6-^|I^;7%gqpQ?TSCC^3!g)=R9>~<{!dm&blx@%~RWzH{H|meC?xkjT;Xw^mP8P
z>#RtC@B2LV8q52fd~SPm-*Lvfb~Rn(td_`bT*>XHJJD{npod>L-@&j6hbH)Z@xT1=
zMN`58?b(`Xx4+~ues17dmb6t_rbk9)T3?qU>p^zUZL67%O}^ul{fbRBR{8o|$II64
zWxSs4hmDsVSUI&W!Q$T^S=o;wy_G!Y(k%{rJlgoE@q7BDMR7Y-mV1l7+jErhy7ZZ^
zY0D@65zd+--F>UX@6Pvk<pn9%vJ33aaz3~ud};HY%(`caD|E8h(l6B<{nWpF@}tI^
zKUCK#$nfp&Sn=(2_oNq9C;qQuQa$VxaC4oBCXb#te^2~@|4nJ-Hd~L%KAfLoA+B%7
z7y2}}RrvHhwd(tK+)8ZqA`f^>^qkv2@!*rn<{7t{Tbddbo|PQGe)me#Bp?4NTTO1S
z<&v$mlbI#N8GhQTx$e7h*U9_e%QJdQ9@l@Ge!A(%-u$(Wy(<s2ADOUL@QQUjYe1dC
zho6ZDmVIS>{OEdPtg#SFY<=YH93y+r=;AvTlO7kginPCaka5PP#!XYV;*<WxW%h^7
z)N-bKn+8}J&3aNX?O}81Bpd#9)8+>*w`Ao|(-+xsR`GyNF^knbNgl)XOkWpT2VAy2
zxFt^_D>VPln(e%PF~M^)f?Tfu7169r?*4Fp%flkRZ~xECpAzz))nflye!jl>ysZCr
ze^k^mSaikfONdqCTz-z9%m=?X`zK7ZW}bXOPGT+dg~GR~4!;WZ7Syj74wx8w(Bizf
z&uizv)4x`fKQhv&+<fN8FaDm2_Jt)bLNO;^aXQ%EUU0KwtLDMhwNe{vex=+n4ftg2
zF36Y5xpSM#@;n6*r5W;n8~@&8o}rR1>9aM4Ib+RcSG65Y4~6w>>^aWZFSz-X{Wh1^
z1oOb_dz(a9QoDDz`!_2a&Xs2J3ASXkp8r^9%2^3zG5$DbE+)RyE9-eB^h2MiSl?L`
zV35ztkhj*{;N;wf$V<PZEW#K=PcK=QFypH14HlErZx^m;txIs86v^AA$lAzW8T-S0
z`|EU-;;KN6`Ir3fh;C!}Jag`hSK+5L|IbedaZ&Ga`{{Tfy|p28vi^qqrL1cE&hjRN
zFb1`Bm9SR@8Wh}TZD{!C9rAkqg*C3R@e}VqNqkcGrhdsjsm~pAcKuJ!n-KBq9Z%2|
zYv!#_*Q{9-v}*mWDF!L8nH*2J8O*M2@O_&n+O@VJW#XU9BK}`u{~nyu_WdO1MDxS9
z)Hq-4KNG#{vD#ObvdHR*(bxB%Dsu=go@X@Q=S$0c`?(75B)9SXJsQb#drKGd)2;kV
zKD!$<CY*c0dFR)G*|R2<X8QLSCbBRJvN*nK30WIB|Mz0EudH?62KTP-cU$Z-cZxUb
z0?S{`nv3mcZRNJ}*52dpopejU>o!lvec7AgI+e<j-rV8>Jj&@Sf=+~F`@Ilu$%%hZ
z|BCreTf?jqOX8x0)XoJi*<Uwx>9NvBUz6u_=x8Z;CiT2yz4N`c;l5Z)=dvYxUtBS+
zVx0Y`^l<HtuM2z5HUzxg#wA&icj~hn`<v9+^G~{(Uh_<}KJus`mnoQg!ZulV^*x1?
zr0)ORQ`ckM-M}*2q4zyciO#IM(*z}W{;a*XJg%YVbKAeXIpPV=ira-JU2eXkb#?om
z_r31Tj4KW;G;hh7t-7w&BRe|wVVTP@!`svRuGU{#9b$j&>7=?6c0c=Ni&sd6w5Urw
z+i`WL;8zxrv?a$ZyF*&E6NT9Y74rpV^{y{fkGUYd`GMd8m9iDaMmL!{I<xC0-jw-g
zIOj}Z!({vH7e_iGuPn(+{dG~6m&41*`r~^Rm+Nl?)Y3ziNZ)!du;po;gi`etm8zsV
zjt@HzH~jpTY9qI0f33k*Q;ly+yYDPhoH+f{;w6)gDwkY*mX>p|jp6a5jfPiL&aVGe
zE)y9!J5OGqwz^@J@(~e!gPMoDcV>lOm~yD`{5$@bDD#Byl5K`oE7_0aNQrPt2KH<I
zoDq^-b!p1fH%{wkm>o0t|F!o@`@QJ`Q%-8Fh-N(W`@zB=Ud;@v%4)70mttx9>BIF*
z&Rh01BST_R((#&ZKbw7fwH&U?Pk5>Pd-lfLlbL%?+s$#TwNr4mzxQ<&Pu7-4^Ix&3
zxO$j8yfM8ZZRhJ$z1l0w+L)JgB$_=F<dn^27QQc5JH=W;ys}wwmGy;_8M8k6nKABK
zcr13`X5G0~<qd`ZvM*kFBG($a*!tB)`<dA?KTR7{40KiRAKUCLt67%X&tz7cVzRyZ
zV#9v%3qmZDA9T$$Y?}GFb!qpzPo^(^DE^51%Mi={s_@(O=C!=HI1}4eIG*O2TzmTC
zuSv?{UxaNt3j3rtF5%T>sq%cSmDc*(F7NSze3rXCiFan!=Bi%U#{4VEBQ%yTfK{QX
zuG7FfR)8nV*W}Pao$cRD^H@FllBC=H8aK^29xB%>XFUJc&EC%c307YV1Jn*#uI;~Y
z;aH?zTAHVHn`fEz*)^Z;AIX*eY*)8!>zYQf1KX~*@B}h1EjSf*+Hwu^rW@w7UmUu)
zQUCK7rFf^apC;E{|2p9{%aT%onJs10eIgG(H%#}A(qMnKOv}NOb<gu3i+T_GyxD!R
z&A(|=Z~Ts_8}3)8&NR9|pI_8^x2cEg#SIBhZ0<BN|Jpcror1>auM^g>UfC4Y>Uu}B
z&+|pWwA&wE2_}~mEUi0qI&A9&tp!VSm!8ll%A6PK6nMkTLd=@!+?t|;QQt3TMOki*
zx^=eq^<t+Pp~;7T?v1j4xpj)3Nl4-w)_)b548InI*yzk#Q+2*|-P~7;ZwaMEy0h|F
z*{x-6S1axG=Dl_P{jX2KdGGlru6VZlX^Wq(g62V=Hyy{BY*vZzDMws+wDH0Dgma~V
zcYZ(9vREdoXD+vXUX7Qpwmak1Ez?Y5Dz`3kIQH*im+l>dTL$Sb82?t!UH#?#BXPO4
z3|=o9>X@8UW4PO5AJ!dOGTT!AwDP>5w<mTaSL**&@!h7|aN~+x!Z}%{q{I_n&pLW*
zFIRZ5=Er+a&YGjs_IdgKd}zJc*lTX>wVsuBlg-L!)kP&_UX@s`_4rH~kMR49l*uuD
z1uyfD&d~l=dGP2aaS`E)0~0o?2spptGi|sLIxpeerYkd#-g$k#=Bns{SC6Wv+i#d!
z9AWe0p_9e-yU+Ds2dsEGY12_jzBkuY9os4s=SzzJSoicAZ;Z~H$UFVFm*%RM2a0Z;
zE;r-uJb?ml2hPI>rbzM~;?wbb@+(5O=x#vpxyu(c(mfjEmj!%!W-YYDdez)BOKNW2
z*^)Jd`^ovTkg^Z|UY`x-RmfV!wquH&m_zB|osNq)l&6VpYSO;h%b2<Bu;#k*5dMAF
zcD`6<*%IaO>$`!}xffg|Tdb#f)e26TA7i$1_sWh?$2Yx+>MaG@*0XPj=2`8_S$aR#
z^xGn-fUP~w0rQhS{*~sPaV<&T&isQ~!4jnydxPb_8K-xpA5_sfa@16OhRgH0Zx&9O
z*DD?1dw#XB#A2HU?{BQ1)$(`F(-nWjwc*6Phs~_@D>rsbWzN-By&cBzch569#t>cM
z*bU9EIPZAX=kS#=U%qy);#2wVjRNK8nI;|OH04r`)%mqzM_Aao?C&NDE5AKqIi`P0
zL51CV!ENR#N$e@20ZpI%{(X3L;27tNX1yXkmEY2{88lTd?-0|O`{L8&pHt4gT{mlT
z825<}PZ<o(P5t$%&xj|q;Yn0cvGMFTReiT*wmE8CZF9OhbGfwog4V+gaXk~W9JOUq
zGr6z0zYSbGCy&qL+KlNHRWpl4XG^UyPVM{MqTl(NvyJ=w{Zf1P4V(dUmNI+SKV~@g
zb8X|Sn|$f@RwvTjdyj8AHRqbr<{g&(trp6gFYTV;Td#a8#3Wy!y2@SpX+zC--)q;|
zC#2qCefm0V(=~}@mO@!U)q;$MIYHCoZ*2WL>Bd{(dq%RipY>d?f3@zPpm#0f)dr`@
zi;`tJ3|E=@9?vQ(eEHtx&`)Ne8TD6opSU-J;i^S)?rATxzi;&)mIN9vmAG$FCx0!V
z;YEb^?dOmBR`#1``mBD)aCcW>273w=hVVPIt-ZzI+@f}x?}XR&=39F%u)BXZ`ucv$
z<2Ah(BiL2uUfVcD@b(*<?R5uKZpi-3Sj>FnUBtb%S<mEq-@X#I_|q`Ks>9r;<BrIL
zm{S57y5~OXybn5&Coj+Pd$L^4r5n?(m@nTVSa)p2h4m{=+F2L6tP^tlc-^NXhS@`M
zcI2&Tsgq*rlY{=$KiIa<*709zVDevX?xOkgQe|iO8a}+T=ED1x9A^{%{$A%+`N?~h
zK%|9;y7L^Jqx!kp!hFtil7s(n?Ko4&{WCbSj%&{E*(T4H{@M3N{+V-Y?93~c!92#*
z;*T|aS^YX@@Wk_)8E=$5yXD$LF0lircJ?NAI^8@K@Mll6(f#80e*}e7Up=<`JW(j$
zs`6HPNXvD_ti1WjZn?`Bzj^gQBr*R_!z;$+yT7ehH);7C=J#~c%ROgQ_F7KVJvfc`
q?``ia_cw1&?|XB4-@Bz>?bElpL_KJHD#pOTz~JfX=d#Wzp$Pza+Ygff

delta 3132
zcmew$b3kH(ay<{br;B4q1>>7p%o9>i3bpOGsolNYbldf9xoz`uZ*K#k-@Uogx%W<8
zH<Y|Bxqi{2?+JT<yQhR)U0I#&SNl1A-mL02>%i|X?d~rBt+M0CclrOeyO&lT$~Qjg
z_b6_~g2b-}625Oe;OBFq{{NqNhI2dL%xT+t`g;A&t!uwNywhEO_4ei8Uxodw-(7UD
zExXXt*Vk98bm_?6h*ef5g^!M8I;cDFhDZDR`wM?xDVX8Sxaw=)ij5mLPOaY9*49=v
zmz!gq<blisJ`1*eD*e2_gTtdtMT9ZaVNHJuubKI!RYq1}f2<XH))`jC3S=8buI}QT
z&%`?0#q>tI2?LkevEPgv>+PgoZF5`kdq(uN4R%sBylab3NtfG+Nw9rmINi8+W3yFG
z(^rSjKMEbD$==*CNpi+Shc~mFH|>ip_fRl2VK~yz*L3L#b10*L1gr9)iM>pQJP!Zs
z*GQU)H9NP(io390ZBWQC=$OFxhxx_!1<VR;CJ9#<GP3j6J$!fKfWtwJB~tY}X3NZz
zm~z8+QsnAa6Bu^SD-)19^x{TL;9&)pa$erWwV}-li`NNW;Nd;I;D!tnqsfZ3;z#ze
zTQ|rlUG?aG=ElgliS725od;!<pJ_-}uX?!hh2Cc|fi?4czs>D9-Pw5K+=U4b|5^!c
z)sUFL@JuXV_qr%&oxN{%%@q1S=fNr76Xx}~iV__r5-NF@8?P&^59B+wwk1m5_llh7
zmYK`}ZjL`BIJU2uny8y_T8v#pJkUkxdd(ZhFF!S09xR&feA|ahljpE}Jg0N5153E)
zy>`a__vVFbSr)K#n8@f=+nQ|qDU;Nz_2={5ZH9t2Mj3PCWF}lYJC{$^gflD0!-FfP
z(BWmhzWN>0*D)Eh9CWO<J$PLaGHJ<mFUwO3DzTH8!eu6CJI?=ds3fm>ZoPD8{eg$Q
z4GzK5J`ouLo7osvm<Go%8J}x;`0vG^WZepR>EC}G*xy7xIUw+9RoKt`E!pgEep|lz
zAuSWb&G2^fgBQ|?-zp8>ny?4CPCB^r{L(4k);7$yomg+QDll|^&9u9JT38DX{Y^aQ
zYVbAXMAjYQkkwD-ojiVcPIuxx4<E~u4m;mHZCF<$eP(x$*CmGbYyC%}Ufkn#wry}q
zw%&5S?WB6ok&S%+)^TfC?cuy&GxZYJ;SGQFH~gG_VXyJWkWb7?kDnhjG(U95+d)c0
zH}pg8gz%edodSLTC)P*0X?}gvxPPzcv^ut3+kW>7ABf(6Z>OkOEz`emWiRd?&s8gp
zXl2~Fh3RjLmD$1g`z-9&8KmsjIkKF{dGlH9_w>W@_c9K2X`gGVsBdLlDt?h`LRHz}
z`}aBb{yunKJ1p6+o+;*M*p7e71&2LiK89JmEi`y3VNfBqxBB7xqjOE3)yr+^Y7V*l
z<@A*~|La74ecPQZHS6J;<%f>0Z)6R3{4KM>G`eR#H~0ER?|ANX8&0WNf2wV(6EC{2
zjo-@l^WUng3o;7*8~5D$RefXoow=89aEb3@c^1#8lq}1~dxcfM?Uu#=fAfxR`0F1U
zw5Eu+NP}<HoMw{)zvP>)OIf_uH2A7gUzGhT^O*|ozH__e8NB3{zQ2Ebsf@^Ob*)vE
z@;k#O@aKOzd)DSQ-}Iv?d9w?5U0=tX{f1|%w_(o-*C?x_CcIpqt~b=4c4$6+Xtk#L
z#;DZS3|X1nM-`9iEs{|<|Ffcy_a|d%Ug`D5zmu1&SvHSpURtMn-NYOYyLB<mCXCwa
ze_7YdWF$*(@maPdYFW}zs|cQwC+7Xmo^N!glmuLIT^O=(n*>|vDQ`ud-&L$r?m3&X
zt9n>(xRW*Sh4d?Xp4#6UiuO5tUw>9TR(r<2TJGc-jeq@DGZdYY?Aq0ISk^L2Dr`vS
zl&NJ_O?fW=|EqwSp3a-3@_zS^%OmzE<oWl{nN=xPzi%ajbX`*J$IClv#r^fu=FFY$
zC!%<3{<MM#YGS&H3TpD~m;JW7Z{m0p%k+95zq-u?r>MWW78_2z&wgfkRpv?HSD7hm
zoLM<I=ln>x@L=PM<PDG4Ke=|z{=g*v#`TQqwk!+&B;Q-SrrD`zy~fe454>j7GQRo6
z-XMC*elOp-Iw_g@`uY=DDxvaCruWS(zB2A#Upc-1$kNsa6|<HvO6sa?DVnh4;Js!4
zK1u%LeiXp}=XLFdckKmd&Kf-C$(YN?u(x`xLjIqbd+nIYKOA8#xOkdj;fzOLcKOav
zI%q7!%6U|C>)D3vd%XK=IZJXsi`TSYJ6M11>F=GXhnRN!x?7f@qt#xYSU7vxj1<lQ
zRg;eyJ3c=C@c#S3XYoJxcJ}bPv-IubJXddL;=gc*P(t2rk#2{~4?m5v=CXzFIhu9;
z)cPCT#{>NHwX(NF6`xw1aLqDgQk#i{?n{MopJ^>2k(<wNe{qkuTKuNJQnM_BLEfaD
zxz8-8%?e$bdHunKQ%h$o7jrnE8d5L*Z`Q8AX{%Nq@{Dp0WlvlnGpTv^ZjIAP7Itph
zR}>B^D$dJeV86cnZi@<!+MREgjs#quZ@EQrX4thbPeY^jl&#0je|Il;|9{l{RrTxp
z6L;_{JgKdm`Onuf&w1UxtyhC}7&#SO`h>Xaf3iPVdyuJAV}-NU=T|2mvgXT%@hGq^
zuaD~b%BRK9A)Xy@M^>_9nXEH|po_^Y<%bJGq+Z>aAj2RfdN*c*Oh%#ulL>=wd-ZEJ
z#*J(nBL(f{e?AjDsgT6*)$p>n%IU-f24S-*XSazte%#phL(pH?Ai=zM83X49E}y^W
z>;HXz*0(;EgGD8qp{d!x!-C`JGm+*g>ntZ(^3<0n&9x~~`0*@ggNz^F_D1I*w{L$M
zi_hDBzj}aif%pZ93;%EJWQ(wDR<KEr&%M2^cR`-~q$}Gbm-A-b1~m@eetTI{zu$A)
zv%TA2KAv84vM{ZzY|HdFnFp@CytGt%o@w^AfU_IIr_6Zp`KOan+L;~iZf;H&?|8xa
z;_pSrthzj-`k6i3^X_&TN$`|)u~zJj>*C!Jqu1`X*s#MP@PCNp(QdOor3*2O-FT~c
z^p~yNpuAJsEXRWT+%?O&epYpVe$*69y_&_lJpVy~h0oUP>+d2qrJPhKyYTh(_3e4L
zx8*LnnbWr@UqDbWQ9FEHOu*MyS8pd7ipa^mySXvhUH#23S<(9P7Kw!a|NbT_ZN1l!
zGylJNuv})Vcrw@DD2A6&(&ZaoJUra~-O1TGTIix~oOUh$?Hz@W-9pyI>=ds5c4uet
z>FRH9Zie64^;0a&;Qgx5)nd<C?&O)<R(?t;4+sb-Rw_Du!sNnl`##Q9lE!H!R>jYH
znvQC@_sM*0Z)xFCSfsPRUQ+i>@G_s7$L8Bsep=G>UQcn+ll?DuMLwAD@5h|~hg!K$
zU0m!gdQCMq=6~eEng!A`ik_Y_<=QT8S^MJDRPEzoYa%vI=hsN^>FSxDs?hXJQMG@n
zX7I8}7c*2`UTVC!b5>L7uA_jD(80Gsch|@7pQjnT?9Az}EZ-AiKR-J=+s);Cy<<Cz
zDBm>G>}zW@D?dNeovY-^<^1W%$<N<jUS6JldYZ0#@!JWn6m&XL7I{zCliZYY(x|0d
zW8DFvpv1!~gO{HZzR2!*?02NtrSFe6CLcGuvN~LU#$t`&Wj@UdX9s^?8N%+Uagr&k
zaa-8hsKdEt(M5gpzQ4b}U(z`3OiUsF71ho4+M9i*#V&H`jCwI`<;9Od3qJGl^YOhq
zvQp!$W5kp#hj^O36{?~-cQ##UdcME<`#Zhuz9;88CeGBU{{8Lk=Lu2G23`SeJd#m|
z+FmlV@mMH(&!6?gvHy?_-@|nezx{r3ak2Ddb^kc+XI%Uz+uKw9XPIQ)y}P?SKY>q)
zXXmY)o15x;MW1VRdGiHL*NxtG^V!+i+Kb;_y1hOBZ?~B4tY4p9Z{C0W=jZ1m_wR1c
zmuC$SSB_*a@vZ#(YpZ-R)5Aj18<#@X=$<<evcK-{Zr!LY0R~UCLRW=6oEFDpa(Vy3
zxIGmgGtbU84-YsjVk1_3!T#WxU8_P@ZxjDKk;!^Wm#Fr=)Y<iqf><K=99SK`e#csC
z0p4KO=yJ9X{p@z{sw^gbc<gYDQT?^k7oWyDJO7Vwo~F*$;8WZhwtDS!W%s_LKTH!e
z<k=mZ75I+*Tw`T*=~d0oPiqf8p7ki^$9jVee%E4m6g)hd!+qPOki*SDI)dvA|1D;j
zzoiLk8+0Q2<k=%KBB#f`Fx6AdF+6s)-q+Aoxis{UV%f5aHC=8y4LGGsMARp=sMIj2
Yg!a}6?a_S9z`(%Z>FVdQ&MBb@0EwgWmH+?%

diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png
index d68af39186ca9cd2bc755cad8397467a11844a1d..b75f892c462a12cae4f09851d019db23b286f843 100644
GIT binary patch
delta 1833
zcmey(x1Dc-ay{EWPZ!6KjC*fqRtJQHN*pj<d*zImw0Cf0;FE=)Cd3{r4ZI+-;ZtBw
z@2QT1%RJJ495}9FEGe{XnGH)}z!kY<gUrlF&Byf;%(Ai)Uv*5iOl<nWUta#c_`PlX
z{k^9ze^V6QX?AC3{*8zCs?UA5zW@2o<+ah*ua@q7$I9UE@Yc@1{-xI`w;;=<eRKL&
zHN<~j=KObMW5xRA+S>mtPATlzkzu%!XT8c&Mjp*z&m}q@3Qu3Ly*%?e{=nKlFMs&h
zPkFa)Nk#dT^dtBA&nbioKKIB}SgH}XPHjnz%mRVe`BPh`{4n<rFWGq^;%(!5n`d07
z>{(4`Z@u*M#$A{7JKI<5Mu~>ii)>fS+HxXdnuODNr_xP)S>0|Y^u8N}{qYpxf9X>q
zrFlK(aOm^3tJjn*sNrUD<W^j9%43xh@1x!~l@`~E|983;Op-hB$A%@#&P#>uY;Mr2
z^OiwTSwhy|J0E4KF800jbB6!VTaOHV3$N@g4!c`X-SFe^tDjG9OCImEyV3DZ$+Esk
zeC4B5fj%GRhZ!OsRcnR2S&DWjOn=y<$69w=yy5fLpXPU$n$M_{*9~=jcbxO})P#~P
zuc~-CRM#`ke$QKSjm2P{9RIVk3l*lhSr%*e@QZqD9a_CKd-kM)d(Tt5|0~;z*J*P#
zc)wq^=o0VRy1etTCJOVoSpOBU7)0i29Bw#Q&%u_JsSuF=n_+7V>z{=g0{(sn(wpBN
z*2sJ6KKXr`?Di`)HmvVrUR+MJPk*g#`yl90RM5{)l_9rgotNF&t;qF5f9ruKmD*kA
z-qsCjhWS^R;w3nb%#5Cz{MR5OFXGF_)z+I{)#}!Z_#YKNpRD+3%9iH}i#^&r73S3J
zoqOpV!>szFJMwHgo^C!6$W!Q=*kUab`#$F9ty}jVUA-b1q066CV7AEO<7a~nGFKUX
zM|Q+MUCZ#eqcBrpBkO__)yalo%Nn2lNGVw@keahmhlja7YqDv((7)G}YReh^JlvVx
zY4=2JSJ#ClJ7S-@{>?s6yNW4odAsw|rzc;oNjNs+#qj_u{rZ!7EjhiWSu&qPH%vLh
zR@Sd?aPysRvO>d6%?8Wbcd}{eEgEy@xXcP)RNKI|c<159|5~RyFTG`)Ve2S9<MHiq
zsqn1TD=YMcmws03y1^8=Y@6jVX2uJ*lcpZ@eSJaq(_sglnU{{8U!L@hgKe9_0_IcE
z>gnZ`;TxX5zPn@oIUe);`p(EyAE|iXqrFdZoo=uj*cvn#T~LZYb5~2i?<~9bOCOV$
zGp2315y+r+eRY$ai17+d&o7Twch$U{b0jw8&<wXN9bXg}?o7|WQp&TS*I?(pzNu%v
zF<4z@e<5?`?`~5*##<@7{?!!T^9la6M9Ay;<poFDZOXYB_D#18Ib^Y5X48cF&4+f*
z-8h$d!)fs|w>Ml;TisZ7oPT~P-vzcM>Q}Dwo20Ain5>;y)7kY|q~~;q(7yIl{2L8^
zHeS7B7%LQ`|7jBUs?Q33vx{$sE!)63;ceeROVQ8O4zD+VV<`I>Vz@kvr!=6tY7tK|
z*R?!jwchN6vONbr-hW}&-j+R2HcR=&pU`^4V^3#w9jnMMj`rTdw7}@5oXqC%(usN<
zJ3Ul;viPg*9kxjq%<os8rhF^vWlYShfXNG|buU@#xB2ASUxw_#O*c0^3AlOG>Q2<@
zT%G0mpB8<dmT$1`!nMGU%DP4zyBHFU*rI3H+}~YOShK3&tH%P(;O(9}^Z6OLClv5(
zo}tJiU2n2=tD|vCT$}v;yBs-lciV)gfBRkgVESptXZ&gJb9=t;I{M_-ohN*5PageG
zo&0T5_^~ai_t<7L{Cs#oG|O>S)ls+RHib|39+o@_-7#qsLwV9)YvD(RH<ujaU+k=_
zF0}Q>@0V$pZ?k$xv_@(Oao<x}Zf&^HWRIUlw7~EAId9(2uRo&Kpws<y+D-Ah>9fwp
z&IoCo`iI4=?&e{aZ3SVkoNr#SX35yd##TQ~Os9QKHOt9{-uoLn56A^XCaZi_ov1r&
z@5xfeS<+jcMjR|((fC(|W5%3^Yb9R=GH!6HVA}h4pVg=T3^QJ4G3<Nsd-2Eh+l3i+
zo|imi*m`kw47b8X`7`p_^#`|<Sp<0RiE@ZyeB$*^&%jbS@1y#(Wz4nt9}fSTk|Ex3
zZq9-=n=J40O79WmYB4-H+2D42M@irF*mVVI^PYbC*ZD1qYdT}bP5%5+<z=StY<Al(
zV!1DUIpn1LyYQEN3>tZppO^Rjw!3~>b)U<BT?Xsljd~4w4epW$=Gdjr-P2xQYrx>&
zytCqAw_d4S+PMntMTHyh_dbiAoy!nnm#lGEhU50Fgo^#^SI>3(#xhSn*Q;A^u~=V%
zXva2&8?Oz1AG<HOPQLGO`ux(X%2(wym|yq@3Hh89)?T6d&t$J{OKS1n>=}z9B@2Sz
z>&y4paJApsyJJ%Ayf^bi3rh~pe$D!Kmr}yhCzqz4YuvqV-Rt$&=jSb-aLr<2=@teC
O1_n=8KbLh*2~7ZQD~bRB

delta 1901
zcmdna_nU8may`3~r;B4q1>>6;%xiQ!C0h5-nwfq(?)J8~cMEcFZ_B-1vn}`bp2}}W
zcV7;*o;-Q?)N@PX%dh>KG<n@Rw{t7^&-!!pPya#v)eNDHJoWjT95e&iESgu`dwV;M
ze@ouYXU{^kLTBzbymm{Md%9lL!zaakwIUj39crhi>&IvHcz)(TP;V)2l6h$f>js-<
zgQG`}I$E*+XK0dXU^M8M<zT$v<e%eFd}>n}C#&gCKKGz5GyG8Eg>NEi+e@BI&y;EF
zQau*XP%I&Hj;*fq=uI69ZnM?@7RbajoIJ^TvnF~|<OasKavD}55qu(yvyD2E5(GEw
ztmB<|cJ>)*wW)55MiW+XOmwPm6uBa~Hd(~2PVk3K<z)-)=BP(aTdtivvB&hmCW+}x
zvz=<K6t6L!VGd({%D}ZC_Ef{ATUS51X{$W0i)}l6TXk!2oP>?vG1U!>bt#t<Hcv8M
zk*m_~6TRn;EC;K`ftJ)mlN`4mZ1LT0!6EftPU>5MrOcX{te>l!om$U#Uz>P9w7!ay
zS#pwMarJ!xGxL|V5q1~bSKelfl65m#5ykzhVRGgiUfaGUPnK{9%Q5a*5h9zlwIPc)
z;cv~1Z>(!(v0asRIy`sb#dQH|6*q>gTwN>uo~^em_~@)R6EDvTy1cjemUz}yojHtu
zesLXWJ8JO9Gs4c#ah5#8?YW$#w=z{GU&;`#*X%nfe{e>4hq5Pw<e8AmA~&BhvHqAH
zAaTG_|1ksm?$|k(m#QqP5Y>@p%T9BCDaEiNivLyq^4A`hbr}7M;$Cr2Uh~Z}u0Muh
zb6rDY`JprK6Q8Es*urC`7g~L6`-T5}Z<CqpOg}Qlc&uWHKAx6%X0_2wUDgIRYp2`E
z3uFDhme(^h<Z$}@mpZnW?VOPR!s$zbHEeoiqSK$1Zzx%@ZtAbwhu=+qsrRUnw>#mH
z^@b`n9VrgeSqJ-`3%2t7@6r_U%Tbk?n4@=T%0|X3x%}*RLNnwRlx15ykXvzCY=gmP
zX^l8X5u@D0bqa|MP0h*|Gt#^nS5G`2qB_az>ehbV1$$iTkGnO?yF9<Wk<sLpU5g7#
z)r_4E=bLxQb~pyFDa)4KoRwwt`CjlO2EGf7oy!7FYV~IHE>l^SX1t-XuU8>OCcgEE
zD2KLH!@YE-6JnDE-oD;sWwwneQI)@;{wu4yifMGi*+Zu^1*QmkS9X5e+Nk$!2bX}n
zf@6`p%GvM1tX<)U>gSw0V)R>GBX)K2thdwmFsbbC_{jB0|IxJeER|S>$M>63l{-^j
z8rX8@h-GbW6=OdhqQLRn^P9rt$eaZgM|>8Rq*{ECdr@qgbCcz7esivW`#1HMI~$s}
z7AMA)_wGCU>gb0B)rzs59D6dQ`6k*h{8D0?Vk5QpzWhHCo?YUn%wE)&CjK)2b%tf;
zwaNuqvTtt}sJndXmJnj)5V#c|T5x2+`{v2^jUV$5HQf`KQvKK>bk*OF8=i*-Y_{BJ
z(f+?|Ta?=R2F})ic%5H<7H<r1d{2w8lQR=#JW(*&c41WQq{qI>e^)WonM_)I{@k|X
z6Q;~yzqy`cK^see371;^i)KmTyn5${RZ}!gxvp{Fa!szUW#yW2ne|!O<eeY(%imZ3
z-rC5;vpV*}BsEu?1C#E$@lJ?}e{>*1VSkA9<9d0i#dQJudDs;{3o>#!RcwyhHAkz?
zy6v#VeYRd13wef14r`P9EYsSj&i=*BHk0lD;gg9zQu+59e3C_6UT-|n!yS6y;qT`9
zfYdo#7YeA)t9W$o?(XvW962maJd1=be7KoDzjle^Y_mUeD{BrmwHGn!v3oTtv~F;C
z8J)lP>oN8DHJ=iMYX5vZ&cFBPv)TUjACHP_@B9C++V01L=85n3|NpmG^!Df5`+mQ(
z-uLfU_NTX9|K^|G%5;KdkwMP0yxlpkBNjVOn<c?i?>T8j$JPZc4P9IeDo!0&m<~*s
zA;b{E!Kt}~p<zkFvM^<a>&I^Ywoti}yEDOlfAzb+|G)45zyJHc?|c3$1nW&_zpyfF
zwfqa_X(79hCVga0zI*4YnYYjMDV3dT{M07DbX)BB!XeCRE+6~s*}ebTtM>Y>5N3JX
z``vuP(!TXsTctefRfIAV{vLX_)Z*IY<~(B^vD2~3FGqg+T|4Py$~;5X?GC2R2OGXr
z**1L(+YoX=lCOQq)hyjc)-#9Khc3BXX(Q*aDZ+KN#%}(TqMhM)tUWA?9oRNqh}01i
zUK+Hr>u8c@g52rG%NZtFCK5b}jshN=Z|Yol?xbVku~h%X1<f#pDfM5fY9(eIuxn!$
zm+MzQZNOta*KhaIpp~sb{SuKYT^2Ck{aA75sYPGu?z@o*r{6NX{`^z!WqSF=MGG9n
z^W${B*Zkjj@3*IMK;7OrZkzMF_r$#y&Iy>-TOsv#;{EsP3j<a(z1gVDG0Q>0VUC}=
zW3@ZO)rlTUgm~CG&!$NqY@5TE7IXXUw{0g=s?sB`?MQ!k_tM<jG_%?18SlL~4is6)
z<cF=^$}3%Db)fU&m7smMCU+aoe}Az0*|*M)rJ@@xcm7{{;A8W|4Rsgl*`{fn?zpkx
dWBm*E&HFs{Pei+1U|?Wi@O1TaS?83{1OVzzhr<8>

diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png
index 15e419b7ccd88651bd21dac36853a827fc4075b8..36e14c48d14a8d3e5bf37d3caaee661061cec3be 100644
GIT binary patch
literal 5666
zcmeAS@N?(olHy`uVBq!ia0y~yU`POA4mJh`hDS5XEf^R?UwXPYhEy=Voy(dddvaUG
z^>tYm8EpoaQzr2o-p|}WbEfA9&kvCwX3h*;bKa+4vfXfJqT21kGj?~D?_T|FaYUDi
zdBX9NDbMDqZdEXR7aCeEzVqj%CGP!lwpCwVTy*}yvv`$QN8_g5Kef-FKD0Yt{p{xZ
zUcY}S2>E5Ep0`fB{r$iFv3>7s4EReJ_%-@D7p<Cq)GfJzVMWnf`@GW6@8vfBd}KUp
z>-=Nt?1f*Myyh+_@?F&U*yXUg=6jpN+vKhN3yWo6F#LSw`h_JzA>q?pqn95OKisbU
z{O9te9Un|pHvhg@!n0AZW5K<}vyz+mE<TXdegEeGkKJ#x){CcG7GLh!vyJ1){IY%y
zyX1rV49=FHcl-RkqP?v0TdHUF?~64O8cK;R9jpw=0y;k$N<MB#>CL^h>*K1h@BgOp
z?f<c?b+H5Q3x>$`>~|`6a5gG!6+W!9Z|`k=x%#e>UlUSJSD%c&eW8X!g-Otbr-Olk
zi)$6vE5VZ!Ee;-=Zu-L7-KOqV__z8OZ1EQI{d=pJPb4fg|H8tNFmq-n=Xu_Xdp7l+
zJ7RZ#PDPZ}<nxnt-yJSJ-NB+Dwy|yZ;_WUPvr;``)a7PgJ9OiHhQ*&o{?+P-CZ;cw
z{Qb>p0wdoA{pX4&m|P#7+mL<6R+n48wl_1rDs<A!-WBT=%-ooo6;=mmDKI6r{BYc5
zEb5t;GhxA>o+;(~yi@P5XL#Post`Hx!_zCwEE^m+<_ezszx(jMvNx(P&&eDs{l4gC
zTFiD9Aq5Emhid^|t2bRzzVfsq_L$pcmaC_Q`sXhBbwph9Z<{fXy*0Z@qsWK(>5Oa}
z9M1I4mrMEAe0IXy?{7~|SDjlqF{e;^RjA4i+Yg7T7Ti0$Hp|n;=|;UPw{-vPN7wX&
zZdEOdzENm-@6RV+39km83+HbOYzaCq-VnI!wEwSnSFU@1{-gaU%DRC`h(#jVvE{2<
z52Nw|HWQUt8;zHj^4^}STbgz{pKaOu@E6zaDa*}bjnWpf<F`G)BzWQcpV_nTzL9oc
zQuoI7<vIJ#>^!Ht1_jwWB02YP@UyeBGK9JoBpADh8C+~}lRlqu|DgGsZ}0z3ICYRS
zfp_o2kAeve9nX$xeZ3r=A!F@P{Axm3`h100tA^}+m(LQ0VuIXm5vpAcosPDPUfxPs
zp;f`Yg>%(rmRfZa37cgf4-|%1{&L##eEG6QvkA=aHY+$Voc>cY@y`>3lT+W??fg0C
zihr%>+?VMm#MlKo4@e!Ev2{(brkKcwrdf;^WG<>n@Gs&xB{ga5@#lOd2V2ayZg`n|
z`TLQ4<6o`GJ1V*kt&?)DwrXJh^LQ3BkATx&AA{X@{r`W`zH)xA+RHO_S8^HN9&S-w
z#U_}zwuSk}x}#UEe?;}%;97R8{cXO&GTwcsr3^YmUAi;&d0I_Q^%eh~{r64eulZHC
zd3G({@Z2bXc~?VqOjW}MqdN|k*1z?)&wlo?dD7KBk1+eD_iqkwb#q7(=<ZmvbV-Qt
zVyQ)u4P`<#ySv{%`Ef1HgQbQ!_~gdLhGiNKKD?2^mG4E5F21LzcJ#>SmcQbra+d3t
zzIA{5fjfT5uFqCaol6(&o^?;aX32r}J2i{FiZW;H*?V@|IUDVhSHwNi<};bM+>qGF
zX0Y0ST~ks6!&&D)@9jMs=I>{mYqd)3kHo%ti;OStFmNgFX`EtE$gLTn@_|G0d%qw{
zvqbYQhWj^tF3E^rw^Px(`}At>tY=62HRkL-G;RH?#)Y@TqUGl@>|DD<=Ggf+pSKF@
zd;UG5J?U!s!j}q4sT|5KJlU=~9+Kxd^4?k1RdVfNFK=&1ll*yA^3EBdv>jeM$_1=n
z37VgsC7|rUaP2`<^A^2*3U9rwHS?+({~g}%`R&$wfotju)>qnJ4cgFWd_!pKcK66V
z8*b~@e%a%>{QrVw$4s{`uC({4xLvTh?Ufpfbwl>ni(#^7qklXzT5@u)bkwxP=N`*U
zc)sJNgtbkFY=Fu;rG1;b&T2CTur&V?*uwvw>(*R-ga2{+b>`oCeCXifZ9Bdu^Zot0
z+Tm{^*9HU0t<Tr<x}840&HMQ?M$hT<CZ4-$zv$*AznIlMHyC;x*R0vLE09HnX?aH8
zhor)-B5SWIWN+5m(;oJlpXJw4p?$V}WwvDsPFzhlcqYnR_&r_G&4KmEBdggbZY$5y
zeQRcS_f_$!e~;_0@b4)MXZ(5mf5%lJo8RIr1`cPEj6~GE-ItU-l3rT-Rdttnd6TZP
znv+tq0>8<fmc={ry5eJwTtDwLuc=Co<!`Ov^mPx{ag|**csMzF<5SinCJw=2y&Vm_
z4!nnEq-~vE`-pw(y{eb<m`dI$znlI+`rq$U7V^x0f6m-rec?*KefqA?`ipML)$9`V
zxxcn^3S*+L>0aH1dWXtI=I(8LuaaTjxQBby8t!w=%=Xdhr`Jym7qO00xna?>p<%+k
z@-K`N1WZ*Q2TT8KeaXoCc&^ZF$9IkkRz8e#x&JNx?5VzxqPO+EB2mxHCtojTpS${e
zR{1x}OH0)cOc4-HU}9dAILCYi%Yu4|iu-3j%$13q)o$}Nap7B*>wXIa7JD+t`4}FV
zq8r3uvh%fypDL45jk$BN{;jXabhjt1<l_}=JzC9uVZv^~`rq{to|{q=)-2eXUi17_
zxYyq|uXkKy`{1g=;mDK3lYP`hA#_jQg!A*<UKcNz=k9jt=-;}nCOaE$nz-#qpRhq^
zf<Lom`H_&1JQMZf9HoONY-3>AxbQ|twcwrO>{Dm_SwB1G@vqI7EWWN<@|j&_b-wFn
z!y{7;L?!fYj&NMsa4Yf8C8d3p!e^IXEK5FnP~0&vlu;yX?vFw~`|Tdr);(5BdNWP&
z&4R-sRu67-ED&<O(Vwl7vAHk*zRP5@6iw}pHJ4p<{y(+<zUYYeZq-$_OH;hhKfERV
zF_Y1du|pw^_i8U=u#<87vaWL7nzfB)*$;xQ^=Yl@nkpi4g}s6C;XdgOtrDRc_L{}r
z&sb}GH!`zXn{-I^TeX`-|40+Rzs5P}Io}Cp?t;_7_6d($Wrfb}^AHgg6i#y5{A&R-
z#|y7RQioJ^|2(p}wPRb|Nztz?+uRaFt}4w?YI420;n3exFVc2s2k&?-W`4>4#~U-1
za~Hnx)cyLi_~wL@`7@@^(f`uDQtWW8bK}}2P0>kTB{+V4Sb5akcuN`QTeXa7YvVtJ
zihf)v$>HZ&`hLn3^N0Cym7dY3AD39W_V8Xa39)Klzx|}2Um{z!M?`&zb<(3lzWb7s
zq%s?}^vatlUG18o)$vkkR$;h`gQ-((NBp}7!tG^2TXyo?+qiGVqpOL#%+CgVb-w-m
zNH5RKFW%;S7h-O$eS6LC%JCIZE@xI%Ilua_)lA4_g4b_BE{;Wuo=ELj+V<6(_tQj2
z&TYLy9mhpjdzuP4ew{3o<F-5STWO#4qdY&S?MDy0gzi<_Dt7bE(@W)ln#+|euKj#j
z|8I-+_lMe6h3~XItshr}vvM^!mbxDLwK=Z4u#0Q^g{bB~F-^O<%5HNU?DM^Gi7BJ#
zcF_*TVAs@+nlz?d-$jvZnTHM^Z(bF3-LH}D!o`KTx|SlbIyOH~-TrxU`@Umm<7#i}
zpM4`}DCj14u5R<XVEzDsJoWMy@9lFBT)DXJcxcg^2+_;(iq{@T>(*Ry+?-jnm+h`u
z#qq_5IJ~B5O<<0Ex%GYA#ElEKwu)Wz@ksq3nR|8ql_eWHmxMAIcAVJtg)P+evrp}`
zJHZXVdUgvJssC}3-G8OQsc}KH+6Pvjs}EoC*3?Q)vf^)R&~0E@AQvMY5O%VwLu|qk
z`#;mC?@qtJ|HP%<;9n2*>!+TJv3w++zU{#$!H?^fykZuJ46Sqc`fWo;Z?Ne>-{eC(
ztGNV(KRHyimbL%+!cuoDp*+%K_5&k131Np!;~QE}G`Hy8*%z@bO|UTWja9pu{)W{5
zN6&Sd^Pl<}tp9rH;`pc1+_yb13neihkx1FP_FnbI#Oa@I7+rn)ex|R!+}`C4_kSzA
zJJtIAeRZLCF)RNI=6Qv^zXHRZti|SLE7meD(`RZaJ+L|1oy%g?Y~z{pGH$N1e7o2F
z@SP)qYtPy}oqhWE(dV@lhjz;g1-M?lpY=8N(Le1FvpGC$+hnKzi&f7}{rtq1m1XML
zJH0$10#6f_#Z;KMGK3a;w&Zi|x^cimP)qQ2$NsNRTE+M8V~)3HkMa9=-+up;xxW)G
zu}t1<m-*hfa+l}jJkbp+qyhyRZ=VyiG)+9*(rWo=-|C>7RXR65+-d*T`QGl}hkcd{
zYW*4C{7rjz?*HS70wM?bn)8mt?rzgN|Cu8|fw^cw?2mQ!djm4g&O7xy?cUkC$M&_0
zch@|U{_^bU+=_T62GLL#5mr`}#Rdk7$?f+xvDg1t-fVlnf^P=9(1Er7qEkwvUtMB)
zrF>*YsqUp7)?l$ig?DPW*KR!!>l~eXt;>+D$FwqFgJFaN<I{I9x_C|{Z(VTLZhG%q
zKW5HzyBGHQzy5QD`_{_0S8cQ(U2@2|vUFEr=c60Z%eFk!jh6bC6xOkhZ9~oNk0P<+
zjQ4JA)=<{kbbCSLs#~GqoZG{>R&}++aWT()pW$#j%Ed)|Z-z4K^~BsY9K0<-3+-j(
z<e1p<tRKGFVb1rV=iLnby4UjAs^RDT*r)gTJ$rtChvw$V?KRnji#xR!Ep@V$xKn(#
zN3QO4UiDvz+4JV}yp#K}m7C+2d!25!&fCK+wXZhWT~HEGND8>R+dJUz6Mc=DY^^SA
z-?weq7xU5lnf9$&n`|_WISDvzRN5AFLuG1)ez*I2l}F3DX6YM$x%78lLiEaMQmwM=
zR&Oi}<m=D!&pm7Z<LBCw*8d+*=m`zJxhvVM=Ba)RgIFWm1iL*)cl7SAw|XNjJ2kfb
z-syF7en{;%sEv^Bjy}SssKhp*dRqDlCAH-icP5-VHaV@}zvLp_d6)Np+_deMdw|0e
z#%ElNX*t&0L^@9t>6F#YzcPLG9J|jUA$LAH-ur*%_`wU_>5Z-mkw<ww*;NkItadQw
zS|uzMsXK)$b^-4ahkpUro3!qKI<C)TK4X8ObN#RRW<|m=*PWhnWTwwod+Yc~C2r?z
z)$O|j)-R9~PGvb7sre;#>Z@hlX$jX0W$Pa9$~Smr^)hZ#3@6hymzDCGB8Cf-Rxzr5
zQ<}%H$??;gMRvCpb_h<^f4ijiok*>}+~OU-tsbvG&#Z0h^nS7VlfUizRUdxXu2cW$
zzr4Wj>;HenKbL#rFg<9ipv~{E*Y6y7F@>k+>XOTACX_O5I<i1WNh?U;jKG5;{U7VP
zGH2{~zwG7xUmNfK3*E2MUjOL%q2+b2_LkHoMm<;8s(N_XS>>#)MD}~LDW37H!Mz7`
z8#Hb(91<|z!f0*dV<4b>$z_Y~0;Xe3Vlfl<Yz&Oa=C3U7PwLHJdh6Q$qi4s%#0SQO
zzE_zw1N8a&;$CEJaM}5P(R_>A8@{nsam)4ht@B*n<?XzO^-AUy^F+3?niThM`xR?<
zpWAWhLCTz?-42t^`-iSMEgKhUdQ@wfRg3H!XCLWA#v=Y#`>zBFa)*6b=e28t^Rv3w
z)msey%n{or6TI)?^|klwru~yOc3}U|zrNvi_Osk}>5pc+4itU+_J5Y^8B_JDnLAJZ
z6^VWv5j?@`za>|5y3hS(lOIl7Z*lrx{(RmAyeSN~a;vj@(^k*C?Pi#kAgy+4f`reM
zru|#x7na-TE#hFivf$pLN}sURUEb_F|H&WUqgk8N^;rJH75o1cOKbk<#9TQSJ}LM|
z>-*LJ?-h$$-1>99{!OH(TAAm?(5DB=_ek#lpnjgIIDV0ySeW;_1C5e58aTxTV%e@c
zh852VFz}Fgc-gV^?y4oHciT*rwL7V7T>s_M)wJA?!GZ-}U*&IP6S*L3y{~8~*Nc#C
zHX9wT#V6QsN#4K5V)eT8@CALnW`Bc&3`UP81btxWE=oE&&80c<WAWzdShISI88!LO
z7hLuG|Iq$o?UkPjKlUnoV$gWNaJ>BA%c-X2%s)!^md<<nBdYz+qT^?qBsZ`~pZ1ue
zWab|5t~H4@+GR7t{X)0d=1lk3gkBB#J=f*wza1I!mt{7vN%A*ed6;&~fZs@fG1bS_
zF!kURX9f2opFLSrT}&OUAMAcGyYt1PEl1NAPFW-%%yw4AeCqM|1IZ1{u|A&Jn>|7|
z%szdDy<H%ZX;%#sXQ1Sf<$^ztWS+R>|L$C;_!S5L16kX`Y;G}_UtD;5&OsiHGL<8J
zVh*<xxHLrXd=i><X3<@qOHYIkeqWWe(P5{0!qq1Ww)X6qrr4~wPdDS)1SNs0DI(#o
z9d`3oPCq`e;XLQADHHXVKAHaN#@tk&os9d=-Fm?pVZAD2bK@+T&dN82HR}Wd4m@2^
z?BIVOxnD4_h~1=`$<B1m#W=nl(#IM&EZ8*^*37)U-OX^Plexwh^T$bZzO&rvG-%Rj
z*}~_Sc$9h5wgcTKRXcWTeNzkkx=X)D=iGJ^>6e^8E_p2N;t8FwrtO;SzD&ibrt@x=
z^lb>4zR-4Z+K&hRzkARBDZ8})?XeTrbj$aByer+LBgE#hAfd?T;9}Y4ZM;j^Lf+kb
z^-*q(^1FO*n}R5>#p-7soD()GdJ(bZ^VHL=m!_#bsB*q=SAcc#j7{#`a=pCwqP8ra
zAa#OOghhnaF;X(mpfL8(qd6gFanlzz=g)e(evi1#$J*s#ee0Pk^*PtGF$aCS(d53H
zEq;5(VW;Z_+^&1sOS2BSteWj_`0va6BmQ;o*OxCocv3;<Ec05^f0?J|F$gjQa0^wR
zX;{R{pS$>$plgg@z`UvdCU$Sz_0m@Bv3g3u?Mk+9vEAB_br*0y*}JsktAJnwOP3p;
z(t4>`bCL>YUAkZ>Y5Ai~=FdCxLv!yJ{N1)cYiYx`YfaZAH#(e5^wPUJk=ea@qw|0L
zKey_CbUQGgkn^1M_H0Y&-YWfl!KZI=pLX~h$i9EO$U+Uhx3{*;V|clkug;&LF7?pT
zhHqic)n8b?uzY#J`GT|J%nQyFKTTe87FhHwuw|UjwD0nVS*ntevsRb$h4!zq`%x(p
zD!8Y7XZZA=^BVMf`KGiP`xy9(_<MxRVDfa3ePz39mzCtI?M?4P51A#F9&=gB8NEt}
z>6Z13uS<XYlJ`DUu(9&%73(v;eJ}5PT^~68Yqx;nwRO3x1B*LPZ#LTV{&B*epWB<R
z|G%_!UW4`HMwJ^Io;rt1u9`Ww-L5G3dHm(4GnIGk*}$63d_W=To!Om+??O{{O>vVB
zy7^(7$nksTPcQa2y?WQ}a7gwgXMlqKp9k@?nX*cDUyuI(HeWCKaHda5#N`zANF$$n
zy@G-vtPU=Yd%Gmsd5sS};^lI_o|<r6)VS<}uUnjr+?$`TPCH1Sh;X-i|6Y*0F{|v`
z*|&ACUKQVDxBstO_v}@0RJ??UkK-}3DC;=KGm;7?4HBcdm`&UG{%pJZ>(~1h{Xd=j
z4VDY|R2rDRR2|S-f1vEog{c)wZgcSOwR-vM`2261uh0Kje0@Iq_4ScTLW#baKbpd}
z%WQq@zfbq$57%&s`7F;G-!<=7-tpZm^t4z0kF)=`>hAw@@n6lSL%~8N+r18kFWJ8@
z<<#Hw)E@^re|)vCZ#r}=b;0Jk_g`OeGE9i}XPMG)@yqi6=ML7qJH+<+QTbe*9P6UF
zjdQuP>uoFj9s1`r%xjRdWwd3qtzxpPtrM?tkMU3BT@b~<z`)??>gTe~DWM4fD1zc(

literal 4294
zcmeAS@N?(olHy`uVBq!ia0y~yU`POA4mJh`hDS5XEf^RCdOcknLn;{GOl>`-8ZOYX
z-|F-3Be55D{APA7;1sKF*zr)Pe&&wdyVvi&cY8N`SKG5q`?}wXn5@0b9Qyo>LDG>9
zL1nidiNY|aGfn#Ue|`(hw8)urz9ZO;C0a@CJi}eCk{NG;W>}Y3B;;0p4fV5jdvj~+
zYsCcx*5>=|5(@t6UO$(AW5dIr*QAvF?8UA>o4@<Vqwf1md<h+0pa1`UzyItU!=4>q
zE_v%SY)d#3xY1~)kHun%jYTGmZMnC%{oTK7R~2VE(_{Mw^$83@iym=3uzQg8{mV(=
z@I5_Jj3xS}4z}AktL%4W_u1ZNJHF_+!YjE`eByiV%W)W_sHX`uHqR+}H;-3g+1_h=
z<>!90xG#3uQ(x&}MHkE6<0i>P?&gc$71lft{%^<A@|OQW?SuEqcN^{=SbxB8L8*0q
zq-O%d1)0?X_p)c4;jo<Ud7bB?W&fv2h9{MqrYVIrT;Z4;k?`pN3)3cnbvxzNN}WWP
zDIWTJ@N*+)`8^r#i$(6o6qhhQuujOjozW-k${_rI^3<UB>-G35Z>4*lubNjOb3NL)
zVN%BVYQE|dvx@3}i9C8?t`Kdv@~-(3o2T=#E3-INA8Pur2nwILyk^dJ=a#SBDwbi_
zK1==(vEX3YAb+FSh0$h5Gt1}h&-)lJ^>AcXo^rGdKBB3<zh+UvoZlfQ++XcI^m(Gf
zM2^Y2ykf}*)Nj}-o>D(nleo~qWd5fY8(lLw9J`j!IaztVnlFcmuYl!DVaW!QbjGMj
zoI!?}-_%%M?c)e}Z8USH^SPgCDZOVJl{^$}M2#d{K3{YFpyv3)*zrg3qs#uA#Z`VP
zq%3p@5%kR7`%ZQule%A6L+j2@n{5l7H&@<NJrH+uees0KW5yq1uFTJUmgu-?*)QST
zckGePZN=58H7TFhG0d2^i@_sPojr8*Z>@(D3|CJ-bnEH6>#^=`H{NjE*v-1_FB_Ba
z`v`|0wc9L?oU=S|W13CkpYnGf64U0L-I{wl?Z@`q+Yz@r%A2?5+TGq}xo5jr{G@X)
z=X}{?c{%;B>eQVlUhn(4Z`Hb0@9fV1-|Hs+uy?wPzyWES)D~T7yPHg%>X%MDPWaya
z(rns(-uH2DPfQia^V@9jP$ltmrNLtD70WneKIp#KzT3a#*5l{)jN11;vV2^m#3m%O
z)9&E|;rR~_y4AUKG;H`aQ|(i`oR@(1{jZ+i@42^~_<r<5=Xcv3pFOr;`0kdj@p0mz
zgu6O3iuiTvnHN?FHa$L_$>bNVvr1Dctg-j=mODE2+-B#FhdN%pU?DW?m{WgUz^ZG{
z7ie@f-aDLd{qx^f9_w|JS06f5Kl8JI$@<?*xDz%cn(5TW&f8bGP}_R->%;^141Op`
zRB@Q_=wG`m<#s@Bx*)H2+)ojX57+)LsNDOSF{}FU1LnjR50&=1*Y9(f$dnV${Pl0b
zjfVzrJS2()u9$sb?GklR;kOrZli@zx*Dz~dO=G`T!!pByW4T8wtK5xqW^m6IKfEpU
z!Zuc|w*jjfcd@;&jM%^M{JB%c^Nkr@Tm+xw$8mMf4;Q_<&on@8)|zdBwkKXp4q>eQ
z#kwKq^J#M-B_}z%=Z+Cym?9*vD^50?zq5D8wcZUMBqRPX^;oi&S@LI935uqj<B*j1
zR2JJ`8D%XS_fz>yebDJ1$)=@o*34(BWAmDK_8(eZ-}ZI$PYZ$l!e-nn<T<tPg-Pzt
z4V@LSdP#it&CgQW-uu~ZY~s21hkwm12ay^3xhk%?PTA#BwQ<7ozWsrm;VH{b=F9Nz
zj%RZJ%jma{&2Oh)eg7ZZH)RuNty@!neIM_!*yVbAGJfzy)d;EeIp|6!{^7l_Pj-7Q
zPk`XY>bQ09ENi8fbuC}V`Rgs)?m3H7Y(LlC*rl|$_rf;OV|N*j|MTrl%sWzeOvXDv
zL$F+&E%@D~qKWsa1C7hi#3!6kRI+bb;`R6V#rj4?_TSZ(8!82T_L_eSdJ>((=o6MA
z7h%XZZO@yv_m?euw?MvRZ`aD~mgahuiyKPbMf{LiGt;TAWY@pt`}&m|EV(P4^UhSO
z_zEqPz4UkgQrSy3D^|5K?`P&ceED&d<TJaVGexf}v^Mc<<#9gkESD3zjEnVs`}^-#
z-{qyQFue2pblv?$HW&AZ@fTKf#I?1Z){~qT-w-P7SYA9qlE;qW*LzQ%j5T{NnJb*i
zon<cXWMi>8L-2v`jJK>xhWyvgez=|~Q77_o!hw#(fBr10Oje0!_vcvjl;M5pF2kZ}
z!3}4<FWu`u`8#>heYu9EixW;y^0*g&QjDd08_UTPb3Y!=h@1cI->iEViUk5MW^+tg
z6&yI>f5_=WQx)HpJ~6s-D6nB7_tR-#Lf@~lnkdSjb#2wF%b%y*KGA$`?|r3d3m<B9
z2s197RWeh<v7@Um?8S|F^OlE*sxlqA7w;m<GE;DZbZM`Y`?A$7x5I@+Z-+Zp7gx=@
zrB-y<)#1eN$7+e@3zYAQE5F|=Xx_F;^V(SzC6jsHpElh)e3Esd+@qxjRAhP>PDd}a
zSifXZ;k12C0w1+F+dL*+tpC2k(m}(B^Y`Rg(|bO1E|3<Mx_<Lg_|{UH*9GZEqhEgX
z+!8sRTkhkumpw+>T1<N;Tuyo7e1C3fAh$Y`tge-Kz|u+Ld2_sWOG{stUD_si*Xz)~
zOOyZ36_sOLc}Y)R=*C~A|3|!kn+oTjwO=<i@y^#<KYixU$lNEvY2m~Y^1eWH$^)Cb
zd;L47NH6wlTG;<m&z{q1QM0J=+pMswn+|JiKeYS$!8`4L*D~6c8~k}O<2i%e$BX_)
zbh<0bqo(dq=PPzH{h7GGW%<6xWtyH7IKCfjvn+V<J>kH-mPVVpKfS+Ra}>*dTxfqe
z|6Fg6vVV%JOGCoUBg<V4ceS@Yw0d3pF+6IwX9|P%@q@wln-bekSF^uUHd_7a>W<u4
zzU*aMmFsK7wNn3YYvC*H`zc)V+n({m$#Y^(9h+DzH`P9>4cx^kSo353!`^y_oyvP<
zrfP5aVk((&deJVwB+W?cLx2A*-6hud%a+H=;L*?W2#&pda*Un|Qau_se(Gq1zRvP`
zt=6zb;N7Z4XWI20YU3?Fl^eWbS)}mhtE;=i15-DX9$Vk&OOMxG-|AxQ-@tH(xkBps
z0{Iu2r!zjB;NPoO#?s<qpse)e$(hdoSA$KB-!-SMcV61;7Z|$u_RZ>;b?Jgh${zc~
zdBv=moRkudxHBE@n~)p1EV}r0+N&zh&sCg3tFAH2`z~Ja|DMv`58gWyKPkI7@CmMQ
zY7Cfqr9e^Mdd()~C&#TPl^pps`N!P{&soJ3;-6J?mnkgnJfsxzBSt~9_2l+NN(wrM
zo@;PT`W|7~EYi}bz+v=5um9hX<CR}TEr0X9*J8Od@j-{{WVg+4GOtB;F4<)ssq!-Z
z!^zw)whN}FX?}01+Vry|;ZkQ?;REB}v*Z`vK6~R6Uz3F6v6FSD_RYJm9#N-0{d39n
znt1m&O*OiP)2_F#6L&Jd%F(>9UGD#mquq`R<hHDr+qC|0xWkbsU5noSY2LmEc)wjM
zlKF63E#&cZ4mX|y&vrQT7_<Ly-nQ_w@%{o%wVb1iyEOT>C$H=Jce=mS?)R%Y`?lBS
z9fuzC2s|)USB`&hA;I-jV%V7oRqiHLhesAINh}?@@6>bj8LDNpkGRZH4VLKr&H9gl
zF|RmyyV#orkGbQ73+C5Nk$8EBKO+0W;{5&Zj(#}j{$T!@#Y=Pq{)d;WXV{&ZH*4AB
zzkAdL4)`CqK09k)BYUC^=bS|y5*Ced-?=OJ=WZzNTdVe2OV4G()Nfhur5cvFH)p)h
zxn@>>+e|F|_P^ZQ+aSnx-}TVieWw+_KHakZ)!NDLGM5L>Zsc7(MK88$ip|8JOYZe+
zKCm4=)+71(5W9Sh!NCvq49gGN7C$?quN|^tL(n|Q+{>pK_zfh^Jbe51Z`a*C^SnPM
z6$hrR7rMy$fulmY<hA7i`JTMX%g(NpHqXmB+PsI^Q_;kin}?J0WXi+mlaxwh^~7zf
zzPy-oHm%yjii4m3e0y@||9O?q&&_RK!YrZ4apFg^CBK}F#p?nKInBC+?~5Iq+2*Fq
zZhC&e+P>_-dfV07TxVF1HO*Th5a;mv+S=U@X8!ufqVV{|sj1rOyGvePn#yi4_xBvT
zzrVlBKYst-zlLXy>(^ITS39_b&7ImE&#mCx#v{3Za_R4HZ?)%Im1ZTfd+5%2VERF$
zB3PgK&V$+y1{F~v;jH~-Z*PU_1uSr|`E$TcW%bop^NiE^zRz`Dv|)0ctl`Vc%a<QG
z;9%D`&#u<1=uGL{g-T_&x8*+W7T3SW*7w=rNwAZXlTFEqYnxI}PyBQ%^dPGpvwWA#
zUCk3(AzgKUe;qyW!!)3-qxWFjoz3azznNxV%Mp8i|I$f@u8xic4<00xN!eC@YB};l
zyu|2%;r!okx8J`Pw<=`iq<Qn^eR*@!*r{_ZV=UXLkIS5!3yh>S#N}ohr@v!(+`duq
z2BX1~&54KGdes+oPiob>^j_BIQTd+-=YJ_opYvJi^oi;E@s1W3uAe{OFYhIHY}cNj
z&t_j$vGQ$YOP+9ZhE-{n<DQwmt?#|~trPzK`s&>;V_6iTba9!(n%?tgjjpYW-7Uga
zfBg9I_K5Zi7cOvJ$dnL#YkHSaz$fj|WPiJj2~97%t=1HNG0MNYt5n#xR7x<GNnW$d
zW&51pLZ`LD)<i^_JkdBOXW`T={A=H*Q`##h=`M3@c0A#8W|ryeS_=ivmFF*>zs#W<
zu_1x&-tK2>uGCHy?wp(IaBb($&(F`V2wLjptLGfUcF})Qgt^kJ#8u1+Q_dV}<re>w
z@F#E1FPWb<-SaJrpZ!=7xOm#B->YmCza~gdk4!5n`t-T-^Ru%DQ#(o>{a=-RcyO@Q
zh-o$xubblLhP3N5^mlPM+<mTj#5C{Dj`vdUWiNQXdMFYpe!NfCdSC7DZ*vz+W?kB$
zzQ{+?I4#G=Mb@X7fn}290at$|wf<urW@dJikDudR9lrjaqf5A>_Dpu|(<wLC@BjC!
zZOM`)DP;`8lDB?5IXO9biM(bz+u;dWA?sppCO<sXdY73oj(NjJdAph)6$cs^Cx1Ea
z=lbJf#3`!;9bGY5S=j>zAHIGq-J$(iZpI`#<7Azy2{J*vIbq%pS?kUSe!F|4^F?&U
zw>LMFT}qg@O#YcL(>$Vms&=^Ek!RnNU;55rm3MsZxW_l_9Rr8b)B{r2Hl>`Lw68<Y
z``eqF`)_}bd3$^NdmrWG<!khQ7+<~+acY-{Zj=e<Q;WXYAOHRRt?NBq@9p-Ci;E_$
zG7fHT^iVA@ex#kF(5)Z6t!D$*f=#KXh2rj{NW9&%CQbH>_=Uyp{9QdgDmgj-S225}
z+)s;L=0D%==i2S}vPvEtV6>|FVPL&o;<cYk@>DAZClApAbC1I*7a92;lp5-vwb;eC
z;O9CstFj`=$2pHCHLmaNTN-ZAx8AP&-JR~pO(`e)?Hu;RH{Fvp%e`eX@u+4oLyrsB
z!}H>nH#&<irzTq(eD|Mltoph~)!`p{7nJ%6By&!zIej#ExnJ*bk6W_gN2OltdfU~1
zVEpSX*xdHbcb-k<H*uYa3vE)?(>sC=cr<Ufy8M_a>+R?B_U~QZmOVQ$F`4)OnG;zt
zC0=Q=hSP2p-21zu@vy2!+rOh743iIfCHQ`bHQ;{XdrW|RAMZ8ceY*C>O!*hz#$Mby
z>4lu*|D+!coe7=>qG!0-CKj?ZvmU(O!umAbP4I|S*E&_#a`|7=orNE_G&3$}?KvFW
xBdW|ZX})BuYYC6|T+?6gm3T8foTc`$zR>Mp>G4eWWnf@n@O1TaS?83{1ONzMGUxyR

diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png
index 342ce34e1663960d8d7050a9be57face3571d336..06dd2a740ec2abaec4919c991dd17ee007ffcf28 100644
GIT binary patch
literal 10264
zcmeAS@N?(olHy`uVBq!ia0y~yV3+{H9Bd2>4A0#j?O<R~zwhbd7*fIbb}wr|tmn3M
zKi8h&Wk0$}H~LtSxY=U^<I9T|Jmi#k#yRT&=d;ALw6r}vfAb7yI&ORM;=-~sTaW3@
ze-ZTaUftbQlTMXg{i#&v*zocCsWYrI&&)JVKR3s+_!%lg-|2_)%;Yq_iZv`5fpO6Z
zHJ>+$@y8{9xGe9v@vHMHc5RdDSe3hK_J0>{IluSlt@;16=loBWXJkJhkdd~@|Lavn
zL*r*PY>YAw8mC-*zs<6u*s|yQHQ`gmM~a+R-Brz*mF(Lz)7iwidtTV??7R<`KburD
zdX{~7JEtc1%**{rOe_jaR+qz@T9(grPv%!~;QJ!^cKzn>-?Z1=*|jLKA~5Nrkd%M(
zx&@_tvlb;b{(7*dCG+^KyP=Z%>!;WJRQvfe`{1FyEbo|t7cfXEFj$#i2tRGncl=hJ
z`f}5AGdDR*T6^)%-UVm>@h_3w<F~bZo`qHa4iPy|7Y7z@xz)EdUS|}Yd{n}Ing7h!
z>65ayAAe!%@U7rxL!M5>a=psGUq6IKTsm@Q-@9j?4E$Rf!-_xOzhAKU`}tqL9$Fr{
zmzvw<+j;r(kyj?Sbdvg8KA%u-{J_E-d`4tq!w$`LTbuK34er-F?D}$G>oxm?hp)c6
zU6WgqeSf$6^G7Tz_*|7|yDt#uo_Uhl@&dEirHJ@r%hqRa{XQ$?`##Nu7R=L{m-_ci
zu5d`Z_<TZRK?2*f1Vv?@-z^uH{dhUWV(zLF`ImCO#N5igert+k!3h(_j?Wxl{wO!h
z*G&JjAoTseTUpEaug|<^xBP0t<h1XLS}v*;?sA+n$Mvw1Th{|emLrNv4E~ZF1_s6l
z97I)=GJ8*F)%>>de7{ZQ?d@0M`8#U9mR;UgWgJ_XDgBZ^^3OGvj`R5@v)mOJ?=0l~
z#3b2$FEo71o%+_7bFVMGQ#IGX%2j7~f~AtAN7EUf&!-iX8WQT#u1ye_ZC3b0>*+OB
z@7=p6t;=5-`hHi=+wD$<e>GlydB=2Ck@3bIhx4~jzIt%h?CRmvRd1(S>@2+4elT+7
zce9oJdY%?HT$W2mL^yaD3JI{EIivikyIwxAO_4)Ff{$Uc%tjV&fwoQ&)0T;o5Bi^<
zu*BSRdfuhV(%IVe$MpivFL9{ZmNrZJ1f%AJ<d;b+mOo#5W53Gf==)o%ev4-Eg=O$v
zJGjgLNX~j?CbbTZ_=cH39p)Url6XtG&-l?6-LLPg^y)n?xAteg+<i-K-+YDx6VzD_
zF#IiAe5r>22?KvhkDG4W|KD1Z`TdS2Jzvvu()-1iNhZJ73#m`%aBa7E*;w51S;6@7
z3B&Z)XB*UwOgA(fW%FeFanAO%*t*F&@x^O<UoV+k{Un?Jq#yr{Kic0D7P7sXd|&yj
z@`N)}K159RoL{%?z`b{&a_<Yf)~>$vCQfOJbjMl-?g*cWDvW{-0uv|y|J>QAlE9ZU
z(ZV+-Qh$bY+?G!Vx9iydwYyt!cH+Gy^J0{vC76;IupRoSmtD-vS+FB-YSDN5h!x-S
z*QLbyKYX&K@BKclv(494c=RszoL}%%=#j7-H%lSY84FG3&YH4FW5(wPtRl{wEn(!A
zjtKR5JS|6Sc_y25%Soxs{O)r*tB?F+oZBkseP6fX$I%`W^9c+!$1E<eS|#qO)?EEQ
z?&_z9s)t@oirDFyb8X`WmRqiqL<-+se)>vMQE7(7k+VFia>4$#do&VvrR=MYd$o7E
zY5cz%PJg#;J$K)4Nj3isW~~MPYPN3O*?4F9uK63j28v6U9=Nul>gd$+xT{tFqlMVQ
zGx+q9&vLo7JrMjKF@N5gd6G&B3Je@+Ok8eFqLNwty!F>EXeSkl)~kOEE6bF7=Cj@C
z(o*I-FAcTBS0BFm@2A?YjGc4D4)=>6`u4vtZq?e&bya^?U2FE%^Sh=iBAv0B=d92L
zwvP($Hvg6poM-#(rhi&Pqvj)NmC7}H-ES#n-rbz?k?-Zo{*sSR`%8Z6@4D*pZP)Ad
z@(hV;471%A%+H=Dt(oDsur_J4OKrYwXY=(fk}+GWzTXadzb|WJW@0I~hroLWn@4@N
zk0ffA$*9F=X#VS2&RQ^|YR##QncUp3TfQwzc{s^(|En(coloSsE6yMOeQf1f%{@VK
zR^M(5pXGhRz!;>EoL@ch)4!-WarXL8|8`&b{e0y&Gu6KH0&3|SFSzP3&Tsf3+-o$c
z>GT1m4Bi)<ZzeK)lw(-vK4XT+cjd1O&c2v?_DK1<iMn@_etgPZx35TEBc02^CR15e
zKcG0A`-NFp-Vz^uZ`&VzKJRP$tG=hs`fHbX$#sr<4HMIk!_p}aRSG13=w&dA9r!Y1
zw@a(P@T_x<Upp`B74GS}X@2A<^QEulp?BEz#P&10ByjLFbg3Vi9CPi+nR)LQJ%5q6
z{>n$EO<(uVo%-mTs{D6zQy+Z~iP&SOB4@kaRJvyQNalfq#k-URiVun=@T^w8#-g)D
zekF5~N05EcfmGfd3}OOP9v|#c*I33TWysy3{bB9zN7novzps0S=eu9uo4$7Q_DSDX
z=8DzrIDN|8-GSG{!RDR8Oq=8GpTBI>(w!!xdv1qCO>@<2_psdf3C|LkSBD9{Y{+h`
zR4Hsw;81U3zJ4r*|EiF-cF82Wpe7Z$>zT@P8<qqgnYw^&nT#=CVZtMU&mTNCTI6qx
zSw1uF0I%Dm^?UU9{S7-)b-hMt9V4H_XHB*n;krz+ek^JIw{>>>em{5NuKjZI<|UD(
zooN-WhdzipZR`{4j(DT==%D3<<oI<w?2{Pef)6fgW)O2YEwx&zWLLvf?ga`V9=8<c
zvR&Wk{;6L@K~(9h;s5<JYnSC7>F8K5c>RHOxtw^K?%Vr?KTpYS`Z{~dwfB?e*LK9;
zetgB>&0)q#hU$sR6Kq01<S@prn9gzd`qr8sT_M)}->ug-=qc+YUSl(w<8N@MQ<dpx
zV}9bag$I+jCO!V^e(UfR1+5mAHSMCjRU5dYlxI4NMX97$UwEaE*Ky*bxoq7nvqvm~
z?9VpL5IjEn$jM12cfGIieAkt&@3?teFZS028w&-dcK_^;Pk2i%38p_U-|bUZujx~M
zU08O$QqQ@J!uN$s|8OPNi0Phh-ddnHbL-?&%TG@|QCM};qd}y*DU^|$ajk6_dyY|C
z(kY$`3~2&NX1WX48N8n;wMbjRN~JsF)kMFxY-<Gu(I#2<$RjgyHyN@R=;$TfVvc08
zR=rk|Yw`Bwmv@@exA=d4@PDW8_fNKA|1Zhz@?_LzT`>Rp!;{8$6qY+kzYt_wo@Be{
z!KvHf@{-S0#rgESbgs`+VcKea`}GfXr!=cHnGW`s2_+YIxv^VEShi+Fgm8S-zxVo>
zdRkX>7vFIYYey^2lWyXf+Xc-Ta~n7coCIG^^oc&A<ha^*<@aijU7p)&;$}C`)_ePB
z%9ei(Mtm89jUOtdZ%#_TaO0_E#pc6b9QUvM7T)ryB<QZ~^X1?5qIdq~dibEL^&zW|
zu})gM^o{=Ae-ao}6gXETGM?cvKHAmb%e6+LS@vQ3Wu93Lw%3k0F8_PPVx|+<B&#{+
z4df@V2(XGA+*cv=>GA`U*S1}pQOhG@9R958)alxA>WA}=uk)WfmTE_3Jg<D;@JHR*
ztWKceSEccVhcSNACtkmLICITCtM|v>$0t_(F3o0p-tjmzX(L-A<7T}MccJ=CxngoL
z;-9-;RCL&e#U>Q;&-(T8di&Yp5}$=@HVAZV_bRDw(0Q`sDc?T*9j?xk*tW2&_E+G{
zRJdpVGiuk1FFdJtw^r4Ay2<bPXu<h%(pCNw*X<MDeriAPq|$i8<MR6JpPNmRXFLhn
z9zJVR)Tvps#ka`p@!9xwmC5e=iamb9%X7EnDZaaK@)T!5&rTTzevTE5mS=>{1Z`oN
zYZB0>7;?Or#ayA~xYKDahM77Ew-|jD440&Kcsqz35%_s%c5bB>XXXrHmi9K0>|aM-
zu6n39*(7_y*U&we_ip#rUFMuHUwOh6`M=^a<~OhI|9W}R^Qlk6em*$o<=y>s$M@Cx
z%fEh8UCTaOuSY*Z=e)q`;D{GYFLfU65bm|u$31OPX=|vdm|MTekyA{oG>@>IvCv$z
z=EB_C85Je`G4=;_w-@}?n>4Ls^#LV;etW5hGUpt0-5F+G7HV5nTF^6l--&kN2f2L?
zO6C8;`?|mA{|bK+Fx&Oi-P>=y-s^6ClwllI8h5JQep}UV>!A1Zw^jWYNzy%{`XGJT
z$3wzTZgkIO^Juzw?8L>Wi=Mrr&NKLaT++~fGf}C5W78e=&RUOM7NR=g0uDSCN|t?Y
zW;1Kq+Gm)1bnHq@+?0~my<(2(C9WUh7EGO?%=<KM#{ZmuBK_Bdg=fD-d-K$#8qdGC
zZ%JWM;Fet)g;kun=N5&^KQ}Vq|Mx_xPenm8O>R-b!3|YEJkCx|Nr-h0{&dEm`FL8@
zhW@F+D*ZdAJ(^~{W2sCcmk-0!2!a0*GoCgrU39r{)nrALlR_(l=j2-`arQp%p76Ky
z|3ZeM6E?94nkHviuk*->Th}X`J@@qZ%#<UkA3Pq(7#Z`;iAkQm<bid>8H3V{$OYB6
z4P-y;Nx1xK<Arv<PdqQTc&uNi;GKPd&#BNSGKcZwj4ek}I4@7@cAfe1n?dQ`bsH6Q
z_k{O(BwhSmdE)=E`uCSP{ZFj6{{OeguJ(6{&i3;jwife`e6(EME`IT(tIirlJNEes
zj~t(*e6o}~l9)2XHb_)1v&FQFb;GL0fR8bYHb1Tp6ny_aIkfi)@4C4Ut@y5gejv4h
zOYzg`q)Cli3`%coIWmi>n|DRRWX2gu4`l63zgzOOe3qH|XZ`=}{OZir{ZqXw8`Hkb
z65-#kcy;e}9l!I7eEGX=TmE!5&T&qAX32S^+1v1S(w7TG^0IFdpR$~B5H<M}V{|%0
zc<PR;3#{2sII(ZPb9jBKhyHxgOy!6t)7^~D=qJ4DaP93oej&tT%|;0Z<~ioig9}ey
z<W<posF70tcK^J>=Y8(~=3iYlbKTCD&D?RvpKrNk>A$r;^3V^N!YaA7(+#~FSmv-7
zs$cZVs+b_-adB7Mu0tV)YcIUy&|V%8-ca~whU~o;x5RxdXV?ZV^<T8yQpajd-iA%b
zN*!)lUh<k~qGb1P;Wok9raL2NOpRztvN|-Y!`CgqPw~nOvokZ6rV9!j7xZ+nP_pDc
zJZ0|7w$<$6OLu&gUA;Yix`CyyjK1(}xepPN=PZ3>Zg9$;PJg}YHM8uNJ8tdH9LYK?
z9lR}U27Us}1_FJvUwH}zsq@_yOxf8TAAe-+GK>2PnP1x{PtarDZ?O8|)f**UQ<>N3
zDYA43`dIcPGIi8QZr$*#{(t)VN!hP1{k?v4>U!@Rwao@ql53}Ngi4z@rU@RmxN}4G
zxOkzE!NEmMTxq<^Sgmw7x+|FpWc#rh+o^VWxHuT(FZ<N~eE&X%KzZ)f2N?qSFC7!f
zk=oFwSlM<u=zs&yYS}c=D6@!<9+w}T6p~w|e8t7@f`iqbkJCF^rhI$1dXjIP>myB{
z>5VdloEGdHO6oV<ZXIrNUd^N}xANA(S5nzp9Hui0AI*^TlR7e8?NPwlj#q*oEgq`)
zO*7NI$iDl<0cA}ObJd$MjbH6Nn1wEWw6nU?Ca__Jz~N+Dd*82SnfsDkLRP)viY$zb
z+UGni<<SvGffnJ6H$@2zO)HxZOm;~S+>qHjHF?GIn#>uJGak<Hdd2=^=cMg>S8Vm!
zs-CtpQ7)KWiQ%}=RHq|OjKK%zG<Pb+D)=jKcJ#kka>z}2!;hN2uP^rPVYxT^^z|dJ
zrp4#0=WSK?P(I|LFvme}c`IvTnMnPTz3!_vEUVPvFpalLduDWd`Rscone{!duT&Yt
zZ25RkbahwgEG=hK=E5DxZ<T(0afxIvNoR4KcupeIcViXTvhyEPvL5Qqdbq{!))hPR
z>AzI+ySL1>&~)_qFn`%2*2ftfYz@pTn^~Nit~E_<^ILwSLVlIzdA`lZjl}tD?%yw3
zz5LyA<JZ-9s&~A3zbE{4#`C7mubPjUCUkq|&2T=Fn08!fVp^`U)r=(91G;<#+>6B%
zwpe-=nQ$qpnd!#f$vU#Ebw^xN^oa?6eXk`axozNDWBF+%m#0H<505Ta&Zh3xNTUtw
z6ijS_&+r`XD9!zvu_|xR+o<LCMW<i;9sa^N_uaBy_UAc81s6}flw93qJVSMc+`co8
z7M1htnLGW99XlL@XN#S43oPMI*{J{Sj8vUz`;~bz#=eG12^}}|1HXK6V4Ak*+QBW&
zerhky=^bH^IM1J!aXG_s)~}rZj@#z!J0AawzrUgUu>2qU^h3A*zt?_v&;9;1_U-W(
zC4!PJ8}{Zlwj4hs(~*!i-)&KPJL9L+ty`qjY+6H2x0O4r5qiV0<aq)2P32O7c*T?F
zvx5a@EWNE;wMOGY7SF~Rwf0QC4IlR?COkTlB)moDPIgoPgH-IQYcJiZPAdoQefnGb
z{mYU9*HbT9SD#LO_0lLyvdR6)Om}5Rp9Ax9gKI+<`!+NR$oBU(+uv`#rlcp|sF9HH
zk>6a~t8FK%j@6@NX5M4-)fmqnnDSiE*!c|4s~cI{vJ<#OewzQEBe?fm-bc4m?Ta~~
zd*avrKVSd-{6+TdO>gsK7tGss_DXlf!83|VHra1ZTODiu@Hv2$l|eTA!vluZ(lPf0
zJ~u??JQNcX;I}yHG{dyicGA1V(j&{9GqgGcbv`)>BwEe9#Q2o+)YA7^(ozf9+CF~w
zOAmQ$E4XvT+jHJQ)6-Y3{w(&2b(WEG8_SVKj!&w3ncrqexH{+Fs_DzxV<yu(CBAEw
z<97BN2WQNvJRy9^oBKlaAyp>3q_a~Zcs3?6U1iAj>iC#4ea+<K40AQlnOx<*pEJpL
zdfv@{-&l2*m*+nJmA$Q2{p!bkn{Sk4YB}0)@CiHq_+-$Vw$aV`Nry^?g(UOYzL{c5
zS2#nw?&vuPZPDH_FY&90w%kgc1$qkC=4d=Ru=T*wh6ctcyNe$a{UukjIsI0z|0-v&
z?%#v(#XjGp(--#m3cK8?Jh*n2|8?CK)y+E;q-^_DmF62x?vU~3Ja$HaXF6M9WOKf~
zN@o7Vvek>FHQpAQJi7S*&Ss^!J=?qA$vl|GTJCkg)cCPU{JA1k%_rUyzBICktSDcg
zXQ}vRg4TkQ)zZ<ftIf-HwBLNmeSOv1=-Fl!1)krw$8G!j`~HNF2UUHFljfM-v*c$`
zI=(ft`ctHm=W2!cSzhUHled?A)IB}zxXeVCY5Naeta$%<yQYZgH;-8ym&!ea6v7ft
zo&3tKYLMNnuq@!rsnSZj*AM+ID-9HwPEFOGY<A_tTqcVR=Jl!f^Y-t(-yyv_g1@m=
zS1sFOyU!K9Wwkv0GmbYL=J<5lRfgxF!1@l8C3PL=-BP-%FU8%ScJ7q*7V~=_?|xk@
zeL{cEa}Abzk6RC>PAF4&CLp|qWwzM@_7h$1zj806Z?ex~V0OB7{`IQ8M&Fj7dlxnB
ze#h#`yxWVPr%Lm$GqANtfBK_o<u(?^HyP#DQfz4n33`uWls&UuvVYEcz2@GQnxE66
zcYWBH#{GTi{xbb#-R|*+lJ-?RxzpbGA?A^Tov+8NtqV@tPYmU>>b#M_W%%HB?Tc*(
ze{8bcb8y|xFN-tlUb?d%aDL$S_rbTIzxIE#Z*QtizWwcv_l&EH3L{PQW@d6<<uUG8
zXjl8iA9;#zw!3T0+DBT2CtWUmv&+nn{xv)Q<STQ#FW%zoe!u@$E6*UDz#~&DJU8~Q
zwj~$qf6Mc_cC6ExWH;!U@!nW+=oDMiVX4+-jOsS*H+*EeJPegr&;EDFI^^~3ORw&2
z+O^~8lLxBH4u_{&MP1I?<Io{n!1U~ZfdKQ94{_`wDyz=7P5LTtcPTb=?H>Q~x67V%
zZ(p*x?Ed!Y-y8NG&M%%*BX`2bP{3GU?Yx4NZSD4@9FGizHt?9RzHQP?nk=}>x^Yp{
z;S|l|t+P1gUX=AID}~Onvwit*?f)YjI}Uz3ZtiTkH|+nb`t7OZ_qccMFBR-ve(@!j
zlCI$NM2j<XW;D(0*pYE=r6TL2WlQ$nF7b~o{`>j;?VIxd^Zxza|6MtOp=W-TzzqH8
zWqmh7#F|&Q=C5d!?D()$S%EKxtBx@vc`?`QD+^dN4s=P)I?$=8bgnYcdydID#&_Ij
z9^6oLHf<2HYGyGO3**#&t~P1O^R1gJ=AHatbT#F&Q!UrTkfg~AE+!tF_=W4FxVp+*
zYrXe(H!S%+G5_nj?CPJ}wpeX%uw7swQpmGGfBiz8wpkJ_D>T!N89O`J?2wt1Ize^K
z9mn!4j^K`o!p;IbjDoH*dW9*n>?eG_xvIZvJaOz8gM|9?_3zXRl=po7pLgr9wSMdD
ze7?tbzt80@-+gI{Zm(kbtqLFENi|w?gMY89|K@D@`R@H`Q_b&r{aT*C>EA8BfO9>`
z4R*83`qq36cABh^CoFezi|K3jvi4gdeTJg>WjnY9?srboXE9Ojdp#v+@6Cvgtv52b
z6dSbE+c<XI@tC{QZ@u#0YrAy5_!qxpU)~@!gXyZ_o;Ar0OpF=~yq;y}pYOf(J?!|J
zyn7S(RQtK_UcN+9|I*nhb&KcAoc=wVe@XqTv|CYo{O;{^KmDGWRbbv<)!A=b%QL!s
z=6vpmHY`Xpe0(`st)ZYsX4jVEA)oea{1)Xq*SWNTPtD1+G>o6s)Zp=vLd_*JFJ^6z
zwT>-(du>))clH4Zg9G#GYCl-`<*dDzc66pOv)=xX3-p%DFXhi)*H-?o-}~&pyT+St
zY2B?_eeGZC+9#*tE8J5P9d;Eod{O&yeYJDB(kCC@CsudwPqy+b%`8pOyLqXo<e*b+
zTY!F4>~~jF=Oc{o3RHg8yeu<T;Z})kKL5@0Q*PAj*4H^L5!Y*1x20>o3gs4j<mo<_
z<)roI=wl4a&uuL^XlS7xdU?n1|4WnK|KB50`@SztoB#6C)w}2X%KqN?Oyb@zBZr%J
z)-a}Wo?Wm?NUdyf{Eooab!T?>Zu!;wY&HAB;?(tfi}JXpw6FWV_4#7uk6Y#cN?oj3
zRe$ieSFzT<pa1_k$lv?7PA(^Y(uTKdI{oJOGYAyU*S^It<D8OT)TO9-7vlEot-k-`
z@}&Hpsdh#c^^+f^<y|gfRd3rd`EK@7%S}7l;^Ww5t&a<=2ya>BZ5uN^?Ro0zUFAzl
zzsc&qb6(Zl-<W-%V%aZlw<U)?PcVq={MK$g_ujW#k5;AInpZvVDbwE+)Z*M$yDx85
z{{I*GcUWZt7}>x6eVut(Wb5S(yxW9dzv5Z-I^%`l>?snHmTz9(q`<^c@MCMTqu_d_
zbsJUgnos9bjlVJR+AXuYm#^P>^tAk>AA7u~`jt!9)TU*ae0X3P7sC`O?J4BJ;u#!s
zTRF2iq9wzxC+FIhxs9crPl9Yq(ylU{J^a<;&x@Y()8AjV+U=TNEC2J_gr`quP7H9^
z@9@rJXNl&T*k8-F9P0cGzwfNcS!!2uX+_S{L%aR{nC;yv9VWUxcgJU5%NswJi7wu3
zC0n{=CF}Ri(oa|!8H@#HOCG;sV0^^&qi3$2{*t{#hrgSxyBxLSrMP~*%f250*5ca^
znUvH7&gpvk!DeCB0iJ^vd6Rfvu2pzzrq}DSsH5NFE;~09qe1&;Cbb!rTcQ@<JE_v+
z7rnJS^wQnmEy3^IFY)j3&fgGpm-X0zF99Z(k~0`Li@)5vfq`Rz!_@yr>({VttNdSH
zw(8(q+q}K|SC`vqJXP4Op`^{R+U0HX#{QMZuPkghx!g14qv$>VBdP2uZ)-h&@4j@q
zB0Mba=Ly}pyPwAYl3sW5zC`7W7wi9T&yJd+`)ZR=??DT-IqcINCMn!gT+4SUTX8MV
zbe1Eg3q&*iy8F(v3h0k)YBiGz+Ze$0a86{zB#E%CJIYS^+U@ClZ?9s#`RJuv6D~eA
zl$LIn>#hFivG0Lka03^I^2EDqwu_`bJ#@l)|LKJ9b9m+Fr0*!d=~gX1lTSZtg>j&-
z;bUPzUd2bBm_OZ%y`_Eq#I*Il|LuG|tMvUv%dZEPYcv14YrB9^O8XeE;VR>e88Y&o
zeNXPH3ogIwWaJpbuu?EMfVF%7nb*w>O|I@Utdc52SIrH*^sjR6z3K;_uJ%7_s=qD%
zcDK%(d+og+7vAZqjpboUXfGE1JH1|@K{I~?`<>^H&BMFD*S%eo7h8Q`FaP&NH>V`+
z$yil;#HPdfQkROX=dGKQ-rYK$_wm&3O}~DzpX~qNn0-KC2E)UGU3XVyiin4H-Vm=d
z2v9gXaZCG?+zOTJuOC?D9%1KDO%i3eq5t>Sg88wlCnhK!O=927tCQ2w%6$3Plm}KK
z=HZ^VVkiB7%lPw|-0b?CA4+bEp0wN&nWD40Hhp2w->bFz1@kY)SKr>2AKx{3@8>OR
zb!VooyM1%v=DOZJ_2DY}^>1FfJL7AaP{ps0wYC$}XYX;p)xE>7Z->-GHczi3QXOn!
zcJ<4o*wgp~9ayugS^}NsD4q55SKZz5IzQ^({{!W1?`5ot>UDWbEMAH>bbVgmv-$J-
zH|YiZFBtzmy8kDA-^sJ(@e|{2s`tyj%)e81|4;J%={1k@cQErb1TvjHU&$QOSHDMA
z<^7{+(*^E2Ke7vGluSDQ@s~ow89lFXcb!`^z8Tu8PnvsKWgX|V$86O+?{hY8Sn2S2
ztHA}qt|m5@1s#v>Z3-)k|723GYjizM!fkrKs)0?&1M83B-#P?u%L^qpOn6))y-Dh%
zOb5Gl<DS_M4{++S-*=OazSR6_(M-N2_xCO{|Mz}(<)r-2zvNdl{I-&Q@$zpVi<U!G
z{d0TsidWC7ycL+LJk<R|ax4yYX`U<J=+`hWX{ExWh6<<rqs>3JP0;BMJ?f;RCb{wE
z@u}%+>o){Fsk_rw8>cWyzcR&_k@3xhFEi&j@N{&^g|IMY-c~QF?SGx>!S{&C;_sbu
zQ)#A2HA>xl!R=Gp%x<4Mb?9`;oLNg&zRp@y`ReTQcLmEY-*IUF`LkN8x@>xsl$D0P
zp`hl3hWjrj=9Cqld$@_|VRy_#^#?`;xjJeYJ_(!Dttu44L^-7t-lj&i^PCp?IP)>9
z^23{nHvAe5jf{*r7X<E#NLy{Ws-S!Lt5M8pM#hN;=1aCebTsm@4P#j&clMomQG}t;
zYUv9cSxqZf%&;i@q@*tNTq9utyQx`khW7&AJVRB>*@~5I9*iyVcT_#wJmoiw%zd+f
zy?Tb$3P+v=6AxR+s9dNz$ZK-EA!_4st;R!%TNTzF<0^=p5v%fO;+<v<fd-bCwv`E?
z4HAah8<x4K3$M{Q@I~0RS5b*U#o@Jf@&Ze)kgiPU6~}kv_J3j7{^EnSre3~cP^0D!
zk<Z-`AzLgQx|t4f%wo%aC7yAuK5xO}`KrBLQ$;Uw2dP9%b>fKN@L1=;RdG0``RZnG
zy>6yipI%1(U%$2S*@{G0m&J*fJKlusaQ*JV<k@K0C{b6&djEW7P-5PKpZ_$H9<wp5
z4)QVf$WCYy__1`RbVi>kqqfo_#vs<?cN~iA1P`6>U|g+a;ri$LWrse?;(g25W*M7b
zkU7XB;<RAN9O;a6&mE2kz0cX#SvRqO&&6TBo*{?&;o#uy9ZEui*BL}T<F>|FpW*%D
zr0~GTf}N$~*He4ZRt*n^OrOb@qP$t39qPWoZDKw_IAg2A`H2RN0f~+Zrxt1!SvIan
zZu8Fmy!O)1l=CsWT(4D6K6mNu6<z*FkGP()YdafdF1=IwtJlKTDX4u%n9+sN`q_%)
zRT5d5cBON-+D3WHpFDK)e(_u5_g>H69M{R0xv;%UykTeK5yx4IYi|m#H*C5g9Lkr?
zvdAV?#qw44ssxF1?k*yQ$}Vb?baTAZ|E~LTe(!7je?RvxmbTwFF>P;q$+cw;#{CZZ
zQs?X@b~xpEERE2Oc)H<|qw<2HFSlI-L_c1v37cG^ovFL`xz%g=>}$K9{#tH7?cU1W
z8~2u|d*{yY^Nl+)bHmO?mK3!;iW1WpRx;-&B<z@99oW^t`Jt1=fXAgVV^YYE6t7E?
zJHK6;(8~S(#LV}5XMQ^$_kFhgpN@Y|84WB=PKoU2I{w2_PQ_S&ng7y@$r&NsCYw7<
z4)gqQNlVI7==zahC+m4{_xnvpZ$IfSU%I<)`n8p>FU9S8difsTrF+#`e16Z5URlra
zh%tXc(-vj1hL!Dy9647Q`xvka`WP^asLUufwVOTZi`*s4-|Oa9{pG*(G=I<g{r|t$
zFvu_{FR-d{eziyHf2dnp+bPk)wxw)4leRjjJ_tQ1;(2_F)$Vmy?<u{HSC8GJJNId{
z-P`*gDqk~h;1)6d+#-J6rkGi_cG;EYExg&1LZXWnD6=cIg_|ao>?>B7+bg}P>|gHA
zFVp{ceLFV)7lXu^FJIQ|Kl{6J-#QBco=jKa_DS2yPg>vJ@>~95+@F<OrrDkO^v7xg
zyU+g)+rG2Xr}mxWc))wzOxbvI**<OV=XO)hz4D(tb@l#9bE^M}Esv9x*){p+%5J4+
zvYnq*c@Ib~P@A`M$w|G)ZwlYS-mBmL5xuSPaJkX#J@Ogn<}Xx`%DSADRcN*G;92Fo
zNzd74C9lf3d?BRQZ~Nir)m{mC3o2ELvRZCgXcaS`EmU%QvNJhvfn_oC*_qNA=jQBe
ze0GM{BpoVZY<^+R&PJx<XKcUzurQrFVqsMLjIHaSkgKn>X{c%N(KB_+gb(w~n`aVg
z%-yy!QrmD|_OrhZY#mpZvaMAzbV)aSrCeY?+kCqI`l-wY#yVO;TclTsFZiD>*SKc#
z(PKYLl(d*j>-tsh9^T36#_;I8+y~ojE52+=3^+1f<)^@OUY||2&p%(xaGK-3K)mr@
z>!r+%i@CYY@232%^}b!Zy6f~2E78j=3_b$S`)ohW(A?3sWwZaSyK2!tZs^8W#Bch#
zJN1(0?_y@#?-KPF<+X1MxNl!~-{;ESm6Mq-?p=HA)Wk)J2Ti`oOq&tarl9a?vxbV2
z`@DR1=NcCk36V#W7Cz}Vk9~iJ_vM@_ck};U{G0RtZ(Lid!8lv`#ZUJ}h3>ny-QDlK
z=lRF1{(fQBE9cp7_W4-ubh3=ykhDifjk9UCq`O`G<_IB`*Hf3wRoqv6db^n2r6tAA
zR;RD4#{N=x|Ge3`xk;a+m^t%({y+6^4Tc4?ckMfO?r!hP|6la|zZ~3pr|g~kvc-C7
zO~nV!xVW44uk6}!Y}3Cf=T^o>?)kAU=2v%}?aAruRnzws_g?$-;R4ICI}Z8_CQ2<(
zI9=9o?EADj=hw#kbG*-c=fB)Eb+T{fs?Aa2-j_mVX8JhDWGST`P?YVnILH3k|K6tQ
zd-kio4K821xvc-(%hx}yxqD7}e_@hhz`g^|W=g;Ku={JuU(bKf|Cv7CGm(Gw>-@OH
zJ@w{yYJT6|R%|`>VqV%pw%(?S12PP43~X%6xW0KFxu~*!>DDce#pA2mpK8DVy>#{a
z*Dv4S_p@GGyJYMBSh<kbI#V0=?RYj*y5lfgQm$_z?}9Vk%<|th3-HT@9jR*g!96ME
zb(qifH7l<RyzJIIqF^Dqhr=^lLC;#@watgf_Y1d$y^BbH`*+FqveWuG_dcfIyTQn{
zAmaUeYa5djcO2q-+h>Z*U{Xyto3-oqO@Axx@~xWppPg%;{^jA@=_UW)h2Gxxxy_{2
zxZ6piRi$E&h3D?*OSYxHRkJ~z-rX-RO|`!G{Pw=7{4eLvXAoWBKErIT^ov&0OQ)_b
z+U)c|$<!e0Nxk0Oqk)UR?pwW#J<Rv~#HF@#vL|lQz8<He8@)^O?N#Y7=5}Yy_Eq<u
zu7B#sWc>I7!$*zJZ;P4t{%|vLXw=Hpzkc=5D+iUG+JC=hud|D|{<7J(cd}@t#qP5!
z!?(>jE$5Z}apAUW<(W^J7g(gsnQxqF<$k7Ty@63MQ;Gw3M9#YZFRy->{r_L+w|8^W
z9xt^%IsfnWy8-jxJ%26!ZQ+08$7iOmLS*6N2ktl&KVzF^49m~b8RtMEfBqMAu8-kv
TvkPEgU|{fc^>bP0l+XkK8%&jv

literal 7279
zcmeAS@N?(olHy`uVBq!ia0y~yV3+{H9Bd2>4A0#j?O<S#G4phB45?szbA{1`#Zlzg
z$Ny*UT`l7bSLAXyRLIAARI!1<fuVtgQ^A3OONhyVK|$eRNPvY#Py<60`>UAk){`T6
zjurD2e6Q2Fx2E#<n;N#IQ-ZuS!Gx)3&lkN4okZJ{Q~2e?uQ@d=dd(S7x$CvuxvS6k
z)@;pY3wHIjd0+c(&+j+?zvyo-ixzj_6WJ&rsc2(-zW?sxr$?6t^E_x>QMpjBv-+^+
zp<B&&;^X*#A9?*j+R0w|%Y-9o>>mzoKmKcdkbUAi!EZIGH%~q*doA;W*GYv<gE7r|
zZu{QeuV1qjI_0!BM#YK$I`L!z<EP(+&!lU`9_IJiy{@P+nbEZI+{%jI75DSEm*+Jw
z1_-QRP>f!8)@*j~W)olGD_gw(ZDzQ`W<H_0qJbxXHS6%hf>HyC1yi<`iLPLDR*-+n
z=JnZz-9huYj00B<vzW#b=@m>Nr+uePS;ZuJfWd>&$RS9Cm2s)i0S`v5ojRxEUVW`<
zUb(z#?>+4Y9R>~=Y%|!WHHan%Mrd5;op48feN5#kg?|^*<SeF18w#vV%D?AYa!&nY
zX$|xJLv~Aw&sp~JIPz-*D46uI*xk@E-Tl#C{>EC#*(RsAN<`f*tJp2%edOvVvo96%
zuAGwD(;m}lVU_&tvE*liEiBD;67ekwR|GGVl`FG7F-V*D@x__<zsic|u%+$&F>SZM
z%KY^Qt9GlOt2&vyU%Bq@+y46k`O$ygKDYLa_xcraN&L4hN9`L2^E;dBCw<|)b^c8B
zq<40oQ#{QsTzMyL@}e_#leU1U0PDOzQySMD6o^<QKJ(6#1?e`<ZL05QO|A6eYUwR7
z$g4Vi_<7|4j*@a~p550JnC8tsP<-aD{nq0)w;NU-xuqV<H0gd>N5Vt*h##FhY9Deg
zxWB+h<u0#7RmJ~xjVYPIYcd;GRPQWh{leyzpp>`oaMOEPebN1)@;y5j2Yz{&H9>xV
z>AdW{8Zo9TBKURhy>a#V7+G&UFL!q8f(_Zv)jzD66#adnUsOg?Y^}>(Mvwi1{}22S
zTF|>>e&hr9XA%?S?04)A+96}Ucd^BVH*6NR=S|Bu^6_SW&vQB_lPtG8=2-fI4J9Yj
zL>w<#OEBtel4NF^7%H+~HuT7o1#eqz52r*kde|h#iz<9NyOnRoJDtKkCmlFU7$+-F
z;A7GGz~J#C`v25KyQ9Y$J^bzJUvK5l<uVePY|Zz5W|@E1`P(snSX^H#<=*Gh++}ia
zrG)T=%a3M!oSZAWqM^jjZkaWARQ%y%i-TX3tmlh<A0{H(Tev?vvi6;1)avqM%kS5G
zclEr_-SFei2gQ9A+^;<iA5~{MxH5h}(74k1(mU<OL|*p@)7<Y2&VCJ<zxfW#(SQGB
z%hqSV`P}^4mmYq))i3*k<PCcVQP%Q?L%UzNZOdT}xtPE-<@%$UA_82w8~@4QDZjOs
zYe&yBHs0S%x_{sAQ25_8|E<)5?A^O>Z4@?0QR!2T$YISteyWD$VrJX!D_vz(eu>gH
z$_AV3zJ|5whROHqK6;gJHT7el3iElT&6gs>WMAtjoWA=s;`<E0eUqcvj`fzB>s?lp
z?@0Z2zi9d$+XZs3-)5Kmj%V0ao?gmur~BEhzh-y3C2sfF-ii0>wB7r5ZvW|fx*fk|
z7VmVtT=FP;_Wdm@-f%8k8{2mG*}W&BySyyJie`$*rq|EgzD`NBLu})TyJilb=DD|@
zu03v+VCk+9+g|nd)_?vBt3;HKPcfReCtI#RiZ`2OLgoBOZjsGpH!CkZ-~ayykI*@$
zri`A)j7!C4{g&yIioDpOzmffO%yLVqW2g3<NuL{Z?8pAhrT>rIIsWcrPUAZhx$B4c
zU!S{hhpWH3=UVBS*ENg2M>Lz+Wh)%7`ThRl@$05DukSkbd-C35hO*R~hd!9!wA<GE
zx_V9e?QOYW)Sr8Mo7tqwAM!z~)mCqMo?7*7@6`QSSydBK%6{GcA9eS#?a5P(lNdjW
zF8!;lBtPSXQ&WB3Ylj_orMJ%AnRMevsKqw6q(6%FT<THk^RxRr?_KcBJvL?ke&yei
zj0~T&Zdv(jZFsSd`{v_sdz_TN$T=FAZc}Yey1pt`UPD`6MBZTTb7x^bh1Dx|f748#
z@l!_FZBopUJ@4M+-T3QWJumilfuP5;{Eg?5uGhI<JJ7j@@!pN|CxjP>D6U;;?z4;k
z?zM%gOeg;L%u{_WWT#LTziWc1L+h>fbqkD6uAFS~{m<(nH@+XY`?4lD{m^R3ZoIlc
zY5ly`z5AQz+JAKpzPC=K)c8v1QjP8qr9I3`m(M8?um5TB<{z(4t@)pCmK$C@F395E
zu4K7RxKv(*LrPYPb8S{smaO=L6GawVMEce@p3~kIEa>o*Q8?j2$P076TWt=(>o?iW
zWz#5ibT)Q0{i<m2Ze6E&(p<-kyD~e{PPI>8`CDOw)Jx4>MP9Y)?%`RrUv+LYCM`Hp
zeS+5@$BJjK<gdQa^?Ck5_b)~{y?P{gNUOf#3E%JkOzh%^CW>CDyr?!WD4CmQHly<M
zgC{JDX59%5|Hlw($Hsj><z1ua7T!sxdFE?YRPirgXXUUj>es=Jod#8mJu{fk)_x2P
zmtA1DIN?c-gk7}XWh(~%(yNR?7E-m=62;Mumv4BLu3F7^VaxJ^U;D#~cQ0RYgHfFC
z;8Nu*-m;j6q}`MLI4PTK(b%(%ugy;R=`6`|-YblkVpF5THe1(Mu$=r@T9BkX=jOsZ
z=JlU-`%Vkje6{7>E!pN5eBH@ENM?4CKidhLUXR(Sw`UZsY>v5FC-?229N(YFRBI=}
z_ianQ6+c}v&AHojQNo|t#6PhMR`UNm{kmn#@uJ;R1(z?k$x>j~RC)VPRHugd%@4yZ
z^S$M!b^TrR*3Q`W=c~*czifZpmQH*xzu<~zfB2gQQMLtF#OL|1|Jd5R;fw8tm(m4K
zw=f=ddZOof!E?I*fj0evpZE_fP2Slk7}a{<nq-{&wAXHLUgjQi<TcqScJh$QOBwFT
zmd}r{Mf}ox8opEW+q>D73~j6@_)`scX5W`jxO)7`rmkdez9W1dX`U?VmET%rGLEvB
zcB_>it$V@s)PZf*mW^gQHm~=yxXzwzalvHEp@vR_ta(C@Ilo`1dJ^-(-qE-Gdh4kx
zSNUGs@wbaTeQI-e&2o!cZ5<bHofUk4M6cB{K1w=NJTp|I+NkB-tNvN*OJ6(WA8By9
zvQSlO@{-Pqsm~iqV;fE9i&ZE;6LMgTP6$%uc&m8#`IWGU|Eycve0CTmdY6C8|7|tn
zdyz!7`@zR6a}9cS8|zevo~d7&_bth34sUjBIP1lmwkevQx3hI~wMwP@%B$ee2)%Yp
z-}$)e5(`eHuUhA8%+Aya^6c7nd-1m?QRWBtOh2?``KshAhl1nwRq>hCx!T@-_@q5j
zTbBO@Kg(6&e4+DawmeoZ$lASAE%=?a+dIQ&-%KUu++%qBUXA^=?U|5in)31d)pc{%
zbEgVi;xc_zT%e`vArTyZ{@i?q!lh!<A4Goek}&9;mGz<2;D6|a>>2j@b+5l)ZI;#Q
z^lQ7h`uLq$AE!P0keG8mf8l4NNv#~MYa`FMZd(}1>wQt><g#rk6%4n|ADVZcl|OzG
z^E6xa661SsxZm1k$-e!s{GwRLh0FBvd`4!q$qP-welJsRoK@p_eev%CS@Gfm#Ts!X
zzWZF>qQ^GPNGmE>D0J0ps##X!td}>v*KT=etaWfrrdRv)FAF}1+O!<K%W^WJa$)B!
zt+3V*F6mY_-}!t?S1?R|$1OPZ@XLiuC3pKv6@0lhq5N_1o6`>B{fAEKp6twyxzus|
z;B|qJ!^>7yE9j}Je^I}tEq^M=&cgD3uRr4?u>~PZmRargp4-NB%xXg0*;9Onc3pUJ
zTg_rZ!=leZb8fnteNsGW-?^>uYMtjfyU-$W)2pq_Q)iu%%C;@B+L>{qq;blhy)SlU
zue?!n{-o^ey1%Uci-LCqoGpD>@l=2H`wU)tUj46@U*+O{&VTi(Q|^)7!^s)yoS)}v
zGHXuXP}zU#)s55cMf2mFeSc`}KDRpkgu}keJHCpUbROf6Rh<0l-07(@8t)Ij`}N0*
z?@DFstkdOE%n|o2C-@gzh--;zTL)ZmSf^zYR;J)})AZq1zu<UA*%SLa3KZ8Y-1upF
z*c*lL(yihD(v`Tx5B*85eC`nHF!`<ke`{w7`=y@FKTFR0mo4ad_ILLC+vW4`6f5Za
zSXu`5UbVVV<C*%jyFH|^a>eUacJ@DSmE4Wo>A%(BsO579zx_?Z)3ddFO}8b_4!tJO
zUtE2{b@}!~sm3$*^Qgp@r!73)?s>fUWaFO;Pvc|E_^y=K+?<|TXT3V)%9eRG%Y<iM
zNL{!7e@l<diT0E0w=?iPiOZO}a828#+I7?ZnVr{?w_8{sRQ2o2K6~dmCYpio7CYuf
zYWT&w3)r+QEl|x`{jAO)Y1`j?c3r76bszFsrW{j?waQ9lQ~fb(;l&G1xuIKc|G21B
zVSMj>?Y7r%jy>R*r!v8$eD7x=#wBvka_xOs+*7|Ov<diDY!A2kp0;Q6xBkH7%mWes
zJ5#5oI9TnKHLLACBRbt=>5D)ASPGJ+w!h-sEU8gDw<Z7B@vCbex0H%yWwf3AZW=%B
z=cjEo;g6SXo;+{C{?Fo~>l{;g&9o;p&UwH4SLqBHkJlpNb7!&y+z&LXY20S2p8eF_
zWVH$R+uzD-{;3=}YV_SOG3SAZFY_rE&HppsTKyMzc00{2$jJ6@q=4wD1BTZx70QY2
z6)pR>Crq}7DeZOnSC_tZ+HA9oCUE)AIy2+y<(o;DTJ(1F8JOSmGYjAN>DHWinl8fc
zxY)(BnB)U*KA#@z^V~J!o1MkZKx5NeY?FhkxQ@^Af9c3~tu{Bw`su$@SB*lrD&Bl$
z<4v5`YE{o6sns;|e2{u9tN5jTi(+1wpKm;S|H^_b?_NAgyzxQw?~l{>?%$q}de>c;
zf4S|{oTmqv%VHPWZ7x?hUi#*|)h*#Ir(*WHACSGzSzl>7=l1kBPA`7#sNe{zcQWXi
zv-sM@1DaL+(-@`t0^hWJ+uZc*jGx6mvA%U}>rP+&YjNs9_QSsS4c8n!7-deJug~*T
zJab5-MalN@^K<VT?f1nb_p|nesW4nEWc>Emv%9X$(Ku52AM*uPhrF<>3VNbfZr3a?
z&%Bm(&8v<lew(5WtHZw2KMYn1xN#+y*NN-YaL>7!sCC^!P^@8+xyX@nIZyxhJKjng
zw;ivm;mRoDPZU#=S7P!yU$pPId*VmyFBddlyNIn?CHmVrTJ4_jv;AIr&ouwv%c$0g
z;rJYTtWk4I6uTz-&2P0gzLcI_Y<@w)yy!{a!izdRv*+;y%g;DI`OWHL8TmC0iaKkR
zTbFx$KAcyxXP(oxcb|XVy_%%%VkZ;o`T2apr`iY4mcR0=*7JIAv9sM^^Lcx*$<x1V
zo>5+*9rK;z##iqf|I>HW*wk{oHV!g(Q@`u)`?cojf#ic+Vuy?xR;XvZI2)B_E6%5T
z?S1dO2=?dvPAgB(PraDGXWl-;qJ8szT)fjhZ(-B{;Zuzrmz<WAo}a+B&417IX+h#K
zHa&G4pZB!YA5pcvyLjXFv>o4Mm)b2_(I~g_zxccY%K#^)S58Zq1Z~5LrWZ!#b#|@f
z6R8g?JfXOD+xJV;=028OvF_Ew!dC_l7W^s^$UNy$y0qQ>>4eKwmz+YLtM7I!ZG5<q
zfwf`96jjI4qf4)RXvuHYoN1OS+vc@)v&Z+Pe0pAc)+Ur${9au>cm0ZM3zh%=+*rJ4
z)%DCf+wTh<Zd9Ki77=u_D6R1l|E7<OZ#$G!g4<U0XC8dF{dY~qq7`lS@1FhtGxcuz
zx7oS>XYlj*1>e<7X9(cx$a$`~&GY?q&jNjqGRwIcug?8>u)cO7tJ=D&n=4kapFH$1
z|N4SAhbPx_d{%swCMmS=lcQJDNrtB_J+s;j>$jwN6`YWp_s4mL?(PPQ4Hd`dy!A4O
zbXz05bh`bCGwsL1O1KJgj<&a|i#o)rGBA6jbX-`bIopdpxGXOJzH*}V&D<pWN<aRI
z?@hv99*&4&|HOFOxTRDqG*iz-jP1+|*QvYb8f0%?P`>3OcL!Iih-cB8xv!ZgGp*YZ
zvg!A!o-HAVH>mw|m|4WZ{z~JPT3C##z_upVM!)_Xj-ojl>;4qaUL~_$HLc(Jz)puB
z0>7Bw&b=P8dF^|vzQ-;5m4p^tWl~SITcC5vi7_MNeTeGAkgAXs7xw5KdsZWQV9)Z#
z4U2ZW#QiY(UE6AD|IKl;-qYRf;XfyC(_~&J68?K0uY+T^jFPSctAI&V2IH+;yC+OK
zJk81HqTJ!MV`^tynT{BDEGiV=d#_kJwwpcrm*3H>PqM<(YQjI=*T0zh=X$h5M~SYU
zjiH0|g_XOOuV|e#?N#j_g&i3}7F%q-?>s;E$Dil+{~!Ct&ojGh(X#T-iLhyfqQM8v
zo===qq{z<o`PM6^>C>%$PV!#Dk-Dx}CETE5T`=2<`IjS3<gA#SWHo<J;@RW&|2{si
z|1^329p)|B3x!{JHEDd5)+?TK&mn%n_kFd$?d&ryEZBLaboKR@mzRh4$lL#Wc|i3+
z)3q(9d)LUT?e^dNAW^~K$NRU@%o9{SMc(|BTbH)CdRuPt+q;|BKDfDU?N(5ytGeg>
z=|3N{7A0M6e|e+pcAcQ4?VI=$RZ}A~uC2K#IoGQ6Roc$NLq`_(+x@yQLD9KlVV-?K
zfQJa%qq5x}tvZ7md!n{xUES)pqJX(JuDw=Yh9e_pqv%CVsXZ>Yx8<706`wVo>}UV?
zi+1zDgk4|v^O}2kdvjl3A8$XscAex35ni6#nmQ`CmKz1Io)A*|rDN=*tNr8k)z#&1
z-)_JE&qqPPV_ol&q>Y=N-`bkJyyV4&g`1b$ThXf;dQ9=d%jNU8dHeeM?o;R6=-F~O
z@%7v7_v?bH!~e{9I=S5T-;c-rbBfPdns2PB;Xl1}-efi3U&&`?7#h1SnN<?e^;Mhg
zQwo>0LevVEQ<>S1lT{Y~5ndeOvh{e!X-lVhJv}{Z)O=@cId|^dHk*Q=iy0<2E#Le&
zbm-8cjmgKq9l8H@@~8G(tGn9F>U*ocz6z_V+NGp)|K!9>#UdB(DIU_1WkL<9J3gJ#
zUcVw>q0_AEuXVMR@88~0`1qP}+8K%9fPe|hJyK5Wys_g;z2JoPoEg70ec}WryKP#M
z#CnVA@<%??UBU6yZ?{(aTfg0s<FooCU*dBErHLI|3mzW&7rj02t`5_N2NAsrf((}$
zx~6J{ZgTLO>k#JY*{S;e)WkO4+Y<~qQWo$`3|$@e)p=UYj7Z<}PNLdjXWV3^_I^HR
zZO?sjaa+nl3xUa2)3TV=KKgSkF%F!l-fE!mn`M?+Zq(OTtJl{lXlr*zE6zzewQP=U
z^|v>F9{1a`d9}a4%rRwyVgSpB&vR$aoLLhedF+9R%1!lfx6~VLPaI7-{{Q)W-qHL{
z!Qr`~t3p=3{Qmo=-z1Gr?R7gIH9k8#yIjtu;)C8))!rQoHB>GMY*(G|Pw4IZMN*kT
zJSXksGP=VJOIvSjJYz9O?#4qEp%T~Qj~+c@o}^o}^s%xT3)iXL-ZM2XUAk1lr0S`q
zq7(L7$?R!}<oWhZEbhHhS0A2KpI@Ub!u2s$r})!e#UqFL?f-CbeA(d`sj<+O^?#E7
znH|*@k2v=IdbN6W_O&%PTNX!l#<0zoHqVQ>vMzS_N~bwC3r^(yx^zn^?Q^*yZ{Y4s
zhmI%gPH%5-pI`QR?RGt@?{|vzA5K}#abDdde6s%jKc8lW$JdrNnSDCT&u-HD_$GU>
z_bXTN*e{Z7d0b9A^_r|yukk4!ImzDDzu2Ppbh@~A<R^ovTeY>cx+cC*;cK;T)R|TE
zuvNV7)Gp46!7)shlWYInumAtoSzx~SCgv#;sRgSV3*>mS_qcJCocOf6I8m9|YpG6U
z@G_rB<NAMpE<SoON#&7(!h=hCDR)(pD`T0a)?3#0N|~-IIwqOE$5ZxTz|D4*Ced^M
zpUuwS#aYSK=l*TuNhh6Pxl=bx&CSi{x1@gy{P45p+REkgex-Gb>28{Ou9!2#ZeiFF
z=6;(`Css$yU%4t>UbW<Szx;jSX_eOvH}^9}a-`m|&AYQ>-Wewy>yw9hj;J50zFPf$
z?|0E_p)YNxJ3L)d8&~ym>E`U~>;5j;x9D_&`kQ-ut528TuQm5MZKrxG>*Ghhrzzec
zVojV9yNaKm)2$Ly>a;LO;G5Sn!>+xwj5DNen{CDI-0ihJZ-f()Umu#eZEybleYu8<
zinqKGHamUt#G*MVic{>j`_H$_O?)q`v?5e&(OUk5voDl$BNdOTUZ}sb>Bqx%`B#Z&
zm&=tl<=sx*pY7K#XZ!3`@N`G*U0Omx%Yvr1Mm)Q^I{f@o28GHJp~Qn5?`^+d7u{cK
zzPI0IS^9h_)2t<P%kNdnuVl4YR;xP4uu0j%a{29VO2=C^GjE%hdVF5&3}(meUt{j?
zdeFpecj#BZul;A8^cr0(>OUS8_dmF8e(8$b=hus$pOa<F*>c{lEmd*mseo_1o72zB
zt-D=y(&<jOHFwlYP1O~#On<p=*Zu$d{kp}|DZz0U`?FSvK5ynQDd4}m?e^s3JVGzG
z%`3gJzyAM0XI9hYudPa7?Ws9$`~8Z^&1=;>+uk0WrW?JF@nOtIk8QGHJ8s|GTm4<u
z(PFlrLTteE&3qQ0F8bcAx)!?mtLW`5nV+q09}0<nZy0K^|Np<=;*TSqI^QwBlxw~J
z-oD!3Rvl*o)oy*OPkdb*IbAn;+nT3r+9wyME52CJZujTIVQ(MND{{e``RANCuF&s4
zmo4R*Vn*%m^7r?qJyD!epyz+vZ<8G3)7;m}w;I>lTv;g_c3D_O&{>jIp6RxX{LERc
zHfm||a~@>d6+Sv5?cO7i`1aP;(@9n7hd%{ezqHhQIj@vS#zeEysZ+LpdzdcEH0e60
zZpb1Q)7M*4PEJ}`ked3u_1oLq>0e)6JuPKb@}f6KSa|b<<0cP#iY+1wV&5O!R@{5M
z>FQRuKAFs?PoG9wT3R06|Nmb#`||SJg<B3Cx;44=tncf`{r21Jemr32e!fI@``7({
zzeT%qyh?gAH8Oa)-_`<_BZs0U9(CBghdo6$I$$>o*Mk+uJ`0>%9>A=yRK2nIu%}Yu
z*Rl_vo}S)rvG=~}tnLf@%WCHLl)k>Z`+EY9&^(=hrdgLI=PysNeZ#tKqW6Nu_d7Ta
zR_-~_YVt~MlDk#^%Nl;E?dKH_Sw3ocJ?ZmRt4@yPF_A9!)stltg}!Zgmo+^u_qK!Z
zBnQRGn<gCJVynOA=Z~9_KY#wbVwXC%^XzfGAHN#5J(6L2<R|E%KJmawX@Nx56_M@&
zE4W{uKmQ~_X`+hoFB7JvJ6Npl)c!0`_!phgU?3g9qGNoq=+4>s4yRXd>Sw;CzL4RO
z9K)1~XOmOxR{Zy^%vQX6ZS%SdPNy&O{i!L<ROr$9qkFrvfV1SugFE&@-hcEPJd7WD
zJW{x{&3Crh-y?0dCciC8=dwyRsGL*m3}k!6aOZ*fpSj8NSEnjon_7CzaB8jrU*UvI
zv2|>FH{{*jHF27>``z!S7_u0}nYS~qZK!S#J8<?ur30_R*@b%*?E^S1WOvAA@YyJ2
z=-FGx{yX+QKzfO%?D@%7+b5W1sOA=J+!ENP`8(O}@V5@H2Aj80J^UQ<i5CstEm#m5
zUS%63=Qc6rKvL`u>y+2sCKF`@uZxB%MaO?PN_{e0*r-+Jl~3Q{yxniYPfDGgxSm5*
zW?$5r_jjJZo_*|C?rn)@N1Pm%`G|%-nYmE+skQDCX9uwfydmN#r_Ve2aYy`(`l6US
zA#)aU@)8LTr9;XAg6cB+U#zI#=J2~lmect50aby+SGIXZ{GVQMnAiLci1^2(Q*cvD
U@o8}!0|Nttr>mdKI;Vst053@Zg8%>k

diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..b94bcfc081e0b036fbba271d7cbfb986575d4abf
GIT binary patch
literal 23476
zcmeAS@N?(olHy`uVBq!ia0y~yU`=3PU^v0S#K6Gt^3nZu3=GlLo-U3d6^w6gG%_$S
zFfc9no0u-~<d6m{0|Uc5PLMhW0gyx!h!9|5U|?ua05Mq@85kH8I6zEC5CJxl4oZ{X
z>&=ipzKMbBe^27EC6*hV4xa%7o5yKsX>6zYf}IyA`kW0&y}2nh^24NC8I|FOg|$oe
z?RxsCfxGrkNqjy>`_(>&O}}J|Uw@7|{piHQu+#vMR~Q617QDN^=z5)6SH8feE23vM
zaqIhr?OnPpKLF%C1_c2gwu{cZQHNDl|IrgX^Uja?zv<1ggQ+Ly#;;~yB??x<mZRWc
zA(KBVIkzt6$$a&ua|av$OJDpuYkvCl`>U_|_sFdZ2dk6FXlyXqV4ZMfPxABlM_U@=
zWfi@Z{xc=t?*6##s*aijIDi`*7)t#o27aEXvGZVU)`RQDjj<Iw1HZQ2+UN;(Z-c{y
zdRP9d*`<@;T)KVa{X3?0J`++!?=QLj^D46`D8?BWSepK+7A=X*o|heNcSZEn-^Z~B
zY%R-f-i<o^<-|iVPz*CP)CqD3WXI=+?adR9`O>e=Q=QVH(|LjQ*|zXUveHuuBEWVn
zIM2p(<eSK=t@7n!@sD>M+OS;hw%)#5EB-p%4PFBF(uM0xjEmaKUX~`my=2{T=W^fy
zQ>Sy=PUT%M|GAjGTnnt`ORGb}iHFxJ9+zx><S2N`J5;2gm%~eGN_LptD$}!FGMT|(
zWiO68G*sM{N`Bp~RI<-J_JHXmjRG;@Be$+WP4QDmxa)A1z3ic=Q`uVS4`nU}xAk^u
z?flGL>c0|{UKtw98yQ~x)BSZ$e(fe}7yUBE>l~J+wRT$vnup!p?+f;jLTMwz#KXED
zk42u_)tE4k!zx_j+s275FKjkHx>)hK>BK}nNa)D%vs{=pT`GE>@XzPpcIrKtcj0G!
zyw3(VuptZBnHgWR_1x-z`^UTG%^ay0{K}mPVd)pI{`$6lakVxiO>nT(x9-}mwDYy~
z>MzgMCHUQc&-6}kq0Q<)aeEVkO6s?QUChw<pubCI)pe<5vz~tM42ZMWVtOZdF0|u$
z<m~xf+t-F1N(4KHf$89TA--(g?%uWMOIy$RepPc)^j3Nx%;<SS^S^7^!>1=#ZBLSA
zgsENM{gzL4^*xcF;$^I=`ulF*pOL92Gi{E^ov3*)&u-;d^g^okcloP-YoUpS$-MNw
zSnYB1MNili)0h7V-lfy&YnT1A<H;A}t?&K2?CRa#oLzWqO5isu{XU@**3WM~?cUP1
z@}l!ssQ#w<jyLPCzEYo^`n@w@oA~{Q>?+w$Ig-Cc{`wL8Wxk(BP5Gi<cV@m?oOyBi
z<;U{;=4rh`(wl57d=);OH{WKndh5ClnY>^&Qw9)dRFJtgOMK>3|BgT0LI1AS25e%s
z%g${xw2z%%vFe$<<M;VDY&+lk{c=}$G4I6UboY1<{~F^%F>^G{JXhA8actT2cG2m{
zSMy!Ji)b_pgA8nN5Xc9m@^?!Z`OO5`RI8nr?ab(~duj4^x$u|yb4)(`Z+u{w{%;~@
z(t7@@?)@9W(ztGRhjN6S)IB-Ze!~oq5ex=9m>PF&cUu2<%jO?m`hK5IPuhDiT~B(M
zzpeA<|Nc@1>`nFlUoLLGdi-+Idp^@;{hO};4*LI~`HJW%)6?&+N~ITxDuK*Nc)-q7
z8b3A6{G#6O&Dv95AGBrtrZ#_TQj6j1HAiMlp3}^H_woPO#sf3m=lTfLs`>t|nPF*u
z_}-JF|C}b)&x`!(wWic^CdeQL0|(}V55v}Gu8aKH!7=ywQT1HQ({Yd2NPG0y-<--;
zrz^0-*zw86%~!=Q3)}Y^Oi$&yIa!&%HC)P6er2_n2-pa=Acc<7oGYvEZ~LtO?a?Hr
z$W7PNEnc%No3rkxf63Cu`{EpTW<L11IXR!-*H0pI{kP5r_L&ODwRRf^nyc;F?*+*R
z``MXF{b$B~PE22@{+g|6QgQ3UO%G3IrmMwdJ~+?9RIXN`r2FrpLGm|^je(z(%2+nP
z>H7W1xA@m<VRo=1m>j=2^oH4A)Y=()B5sb90Au=HhoWndog3Gsb5&^w{Q2F-{Pm~F
zbFCw+GnF1Vs!g!s)-zfi7ao0G^DnppYH$#+y(5x8FFnjg<J0+%FBbkaRqn0K?EJVq
zW1ZfG!;J?dmkU~4m)SFCUB6_a?)Rlz8vcB4I`&lA<fmoo2BwAri<pp*YkyajFkgsU
ze`SgW0|S%ed@;V`x35dxj@!nq_ci10fi|DyhXS>iGH2Z{&027rx$*a`=H-_jfugxr
z=(Vb%w`RmvjYw^Gsm<rE9iJ~1{jbAFgYm$vt+V4Q9=?^%+p&F5jq!W76{!;^g|7NP
z^TZ8t5eA0F1?Rio>R!yYf2Fg}>%=7U%mXoIKJ%;Eo|Tt2USVS@SM#`TC!KaiHZ^}r
zRo2xejvsGK7x(<k?BraYWj}YpPyZhNeGUu^n_@oP%6|X5X1YxEn+xx*U%KR#D!{CC
z`;+qJdtvdm_bt!=_}a|H(BN=^-Id?;MQQTdNwZxIZg=~(Y)V-#GjmSiG_zw{J1;DD
zcu@C3!M9d1_J7@utu~uqum2|KnKS8(>$!~`_4>gQIgJbqUS6l}*Z=Q+^X7EFb>0iZ
z<2;tXUX*xBWi&p>e{+|)TID9wv5)>jt8c~K-qstDe7o$p*|CkAw;g+Cy6fD^cOk~&
zSG$Yfzf+h|k{|l%-@MNS64!rzXY65|V|~m#UzJ;K(aJ1ESJ$N*U&hKYFbHrwnZ_2g
zd=+#2KI1#(#T8<FKcai@SxucGe^~8&5eLh`QkIi4o)71I-ZIZA_=DCFyW17tM7~7)
z{mmQnyWoA7)UAX6CYwuf=*8^Vk#=@l`Mr~2FHW49bNROQ0p<fQ)<mn{U;n-Q{@I(m
zx3}(Jw|?KRvd70_x8MD?VoR38E2bb>2Zb4n_PtrwYCPxLn$<l`F%K_`#hv4;cZ=%s
zc*zuH5nOe4e!l)g|GMM-=Uano+%04O3i$@I`+NBXDhe<#C<vUG`1+6I`|Fm6o4^0t
zZLd?!Wu>z(tv5o(+T@dHn<5X3^8EN2D~oimS84Y&t}{$&a#(-$e$uCFMM?kqw;OD*
zG7xNhaA)Udf4iS-)#t+QczEqvx-O63-iqPFkH_mzd^;UE>+|~?fhXb*i^tdKUhQp{
zuT!+Tb>!cPDZDHWzq01se75@k0pF_=n1Alku{<|lp;!9EANCd==K?SJIcHX9PU~D#
z*;*3*Ir(d*^v!8nKD%cwWn^JuU|?a2lr#Tuc+-J){`TiwhaI*b)t5C7n5XcrE9JY$
z2}6MmS6MEd2<?{U{ipW1;Jn9k@o#rUUKmtMUwQZFK_RnJHM1ax00+adr=99?p)m?S
z{{40?t9}{mzgJE&Gv)po0S*C<mTVR#t=kR-FL_qRxNX)knYsIuDof(MzgyPNljn}w
z=TVh>Grq}zfq~%@7t5jf`%2|wpZ|aIv%a;;nK8Pk{=kcsUNW29H;P|PmE(v|Q#d)N
zc-ytjrzJ9WCTTKe%iEgWx7)kh<lm`OK@JuU7KUIIfjL&C&)#m|oRoa_&2~)_0R|?<
zw;GEr&0}BRbXdh!a`W<^h5kPSg}t6}tjq!hz=T)@g?jt<R-uo7|2g~cU-Y5E*2MK^
z@27pby81|yM45l%BqqkwESht&H=Ya5;VJRzJ@9R{`G%t}Rz6(v?cg+51px&C2A_it
z1urgKjLxrZ`777T#^j(-5M%fL&L=(JhwDBkI0Rp*x$Kt8&cwjrpinUH&fDKw#T5_E
z8N?>uNc?7+dDrz|@}HYeEK3?2($rft1U4*XF>yBC683)W-;Y{H>{jo;`y_M1Eq4=G
zQ1dl{^2w76>LuUqUa!|$aOuXMd5<4SvO>(4SXFoSevaI`)Y~te%kMN?-R>?PQ?$QZ
zcPIbL4uQ9xho*_Y?6JP_(SBjH*Mm7L1-DD?DYYv1oU=uzX>X$!KPU)6;n?8dz*c@a
z=k$qoN&UJx_N5*NLA?Y4ju(sC=NfN)H`nby<LW7#PGxr*4hBzD*rfXXp@iT)qtyjl
zX7iX`?#<_noEDJIG;3*Du1`|_gCER{OpFW)0t`%zYdjhiPTM#scsx9zq$%)c!5dJ#
zbFdt|ejwQLdeYkEXH4$YUM?uG-ka-mslnvWTD5%NNf83V$4m||HMY*<l@oXIlUHNs
zZc+8;SeZ3TXqAtH@zr&_oggPOJZ^VjFlyChn|~>?ymwDi!eQw@t&>mju`n<+IDGhj
z`{mKEZuNUBcO2JBZIWKie9d9`(d#S3x|5t|vAOZFoOyV@<<``aZ+DX;yc!%D92md^
z2aEXmOT779%CbQoQr9&9PjCW7;g7Sdb(foOzx-Q$=CR#Wg$ub~-(RgdCG1jm=fHbe
z4v9}69e3G2KHl0S!^FtM*ziYDpuy+CEN8{*WqXe^yqtAd-p)~hfuZ4~Fvkrk%Sop!
zpJ&~1b*uj{ul{vJpY(>G-=u9qJr~(_#U^S@cQ_DZ;cBJb@^$@XA7QX*Zxlhn$P;6u
zdu5HlrnR8VbkH;6gMe(tBgvzX6tLsdmAyPui)WsTPUS1{oO!_PQz5gin!m34v*R5(
zlXUhw9GLL<{-kBTm;6f<eU(*;yLef|=hgqayF%k~OHT*KVU2Vq;jj9yzLuMsym`u)
zpv1&DJKsD0^qbW>Im&wH(zZ`m5IFFJo2BRRr^veVVOHPc6l4ACW}RHVYOW+(;?d1Z
zw$Do!;9y`#_{7RIjk85&>yC>L^0%>6<zC-&Xv5_<xmR{{*YA4Rf3_}FhGT~QZqJi7
zvvMbY+P{g3@$#ihp^-NuA|vaY7cW|5XImSVvZV0H%j0{$+r@^2M24C&{?CZL`%c-t
z?=yozVWWcHIiu=tLf>)~dlrA3oxjgnp}|v{V@5vP3r1_rx$fnu6F0SP+1g?M$@08{
z00YB;CwwfI6y6sf^4VIw{NVokN>3Z&mp0~g=l7qBou=U1bMsDwW@Ce5?bnjO(oSuG
z{~8sRl)cmG>on}O_*?Pq=5|>=c7BFUJS-nSnqR$U_StB`9R~&u7R3dBAW1mqLQ3DV
zl`-8V_kt$xKP%<KY#a8!ZtrrVz%A|@ZCOuovz&S9{O6u*$<JbwP}@}IgWGajPw}%b
zaIjpPHA{_!$w9#Zlqdom6%xv2F8EB%Zao*A{DG}sx<l%g;3r$wgIhUoJ4<s&6urrg
z+T3(**W_iHheSCR%~>nS!NPDrL7?G{BB&r@U|?dDJ~^j2lW8_*yA|(_Ye#pP9pTOG
z=O|T3IFX*aC1IEGbXgyPNe&9Sy9M4FI`{ftelI1+!O-BS5Woj<0|QHw$hoZ+dP}(3
zkIWUk^ylVQ!>AeZ?#7mhhU!zB3RswqJzQXGn)21##Me1^2PZd+4zI`JFKX8xWEDM(
z{<jJg^`HU@L^CupG$v#ePs#SwIqRu%QpPi|)l$HqZ?gS_WxgpFAD1LCJ2qasI?c6z
z+sAMx1yEfCa{?0!Q)b!QTT(Bh1hswq7rdPF;W5|$`?>-K|5M-F{8_bQ+l&A4d$P>6
zF3s-X(ke@SBUODxgQ=-&#R`rU933k(6e5!HBQ`2_Xt;`O(AK)!*12-tOz(~VPZ>@A
zQ896*@%MX@dwzZUHOIvG%p7au@)>_^tJl|lZrV{g(O~;*gS$2k4Qm>g@O^DR-gUiv
zv%I{~RV9ap!&9eB5fzO+dvfQ5dGkuYyztDNV^ykV{o5(<qUaZ|Ed^<fFYoRwUccbM
zr<<E^9`Bp`tN7o{8TNMKcCuzaB7}EWZ)Yh!arW%_6Bgh9-I22{v(nc7{^;l-Y4f~f
z>C=|IM~{{Zdd{<-U;fBtm&w(~$1i`JKV47s?yl0~pDv1~ReXN-mRp?5Z@!)OS$k7c
z-g4KvI>vu8|EDuF`mI`3)u7i?St(w>|NQ?Se9M;p{m8Ske7)y<MHMEN2?;YB7~)&l
zRo6Vx+Fd#^_PlV{IgZwsZ>sd}wcgmPeg55agI~)gc&JQbQ*t=){Z)3{?scxYA<~Ec
z{9nMx)6Kwg>HX<Fiyu|VH}5x=vYNrbk>D1wd-?i@K}{cihy|-Nu}s&Gw=Q^)QBpEz
zeg1_D3x#cM*lyk~-@ftXwS~>U=P%~|cjWHg{|g)+FZ1U=GR?hj&JlmZw;^F+jXxqQ
z@9YjgmBOMiVOCYv)m0Vp^B3svmwEl{+WB+mKL7o*cFnG&f3drGWX#@3*Z+TTb+x&S
z^)$8{7g-Fo4qdGM&FAiN<lGEFlZp>5>tfk0Exu$_<QW=-hF`baTg5AHp68a3;8F8u
z+LbGl!`I!(_|;*h|6WMx-h~S*G=u;A{wn?VcjlYE)&Hlj{Pxy5>r+a7&bHd$b}qG1
z_m}RF)biaYsin2bfhBCM*9=4D4f*#4&z+0g@#}qla&$C*`t!KjUp+I7R2e=WU&=UL
z(IH`9fu-O4b9Z(W-riHW@bU5ACGYKST-><4*Y`Tx{eyZtDnIZ@8l^P#pTBc|zM{HA
z)|oxYg)vWmv-nJyseNwahv!R|iu!(&IAZo+xq(4_@5RJJ{+zR)b@ehcX&aUqRfqKD
zUg0wit(|}OocOur9lw=$J&SoRCLf*fukGjE?B-q9*veLNzv-!8IrF(<+n2-th0M<g
zes@`9Z=$WIu*OYkqQ|l>2FBx^0h^?MIL*1~=<q{V_e!*o!UUC&yTU=jJc;oKXE=P=
z7R1T%N;t~y|GBMZ+AUL4jYX9l8d~@L{oViL=h^RHWKvR}-rKXYxg;j9r%`uf?e}-_
zXJ>8w@Zq8Gj3qzUAKjd$>vW^-L~x?d9E;C~um4@NC|f&h4Wr10t#7X_Y877gy}w?=
zSFS$)<|WRB+zIY3M`o=KFYP?csIx8SW~;`o;^+U~-d=6L{m1Uy)nVs2QX4bX{P}L|
zDEvKDd)MuBg;zJWWU8mVes*np?eBB{|83Xx6Zvj-AT7S;;>oL9zaQz`*V!4pL3ml~
zgn17h8~0Zm`^<Py{r1+`{VfI7<;!Y5_sTC|d&eTWeWPsN<3{HFeSLB568#K~nWs*k
z<mc`_CN8jiZ*KH-`~2i{8Tal8J~(zdTchJfr?BKQTjvj=(gq1z9@QEN3E$rQoWVG{
zTYUYC%jp(nH(ms)`Ft?^?b)PVk$!8-%^R~nsdI5f1w}@F{mI;<oqqI&XWNSP|Ba5E
zxY)L~yVYH^cM;=5hcyzW&koNQO51Jvs<p>nWTTW|XKqNJp+!sQj0JzDRVYP9So?Ck
zQ)S!n=xbZ{`5xX4GZVM1-mzo;j2HSTZxZ+`4|KaIP4r;lxN^C1){SYVin&J{zVTjY
zXk}vZJaqT;qk9^Kb?2oFnB){3d>!9@%zQLA*{795K%wUD#$^3#tE0D<J==5kGt;eY
zxz>p*zdbk@QoO9|hLHZRJBJhB?n+cBe|u}~iWL@XLY3YaR{q;t&41j_<OnEpyDA-S
zV2n4<KGx23=i-`~vO@CD&P3LHc(CzO(A#)kZtnH#A~z>~cu;6DVP^2(Pp8)>CqL%q
zUT^TZ<m)Tn3ykk_Up~6D`{Bj_*2mAy*T-6$UsmgHVVV*mpkUy&!j_XK=FyWYxA&xX
zn9dN9xcu&}=F_Lg*7nL<mGJE3JoWqk_5C@|&vF}Y|MfPlaK4?gsGZa8ZR>WIEj9Q&
zWnP_FrPwKd<A9wH9;E&kmSm}1**3GUjX#Vd>qDG~VVA8&u8hEu-<>ynwks~;{#s?f
znDO9Qo5cJri7x{#&T)Pm!XW0-FWjfT@zd=dNVuNyl8LyzDXlWva6|KhyN-<!(NYQ%
zRAOv2kNxgacrl|gxnY5wpe~QQ1ee(c^`+pj%}%(n!878wsoJ_{Yfe7id+mDpy_v?(
zXYmLq*h_w8I>Gv&p6j32rpFf+3iHW$q^5B{d~~2O*=Lq$`dT?hk)q9$CvR8myQ)>M
zqL|n{{rZM!y0Tk$34b_P{Y_@plr3*BE<XN@k%=YQjiG7DlC>wUIA~;iI^*6mqerKZ
ziAAgQV-jQ76<L{G#XBecY7D4<6Oq3|RD#*+J=2Fmd!tDl@9ycE>X)3~wEg|Ungy1{
zPBSW<8ri;IY~<aRclWnFGshe|78W*F2L-h=B41co4p}=q$Y)V#IO_1KYR|vIPpl#Q
zZ(P0<izKUUm~$8!K0ZZCIdPj8ZI!s<_hIWJ_02E&IL@rNX+2%(n2*R`nS0Dl<$t+_
zqprSv5jR=<u-x4gEtiG`3m%-Bq&jg0@3SS0v$u%!D{OP`Kge;_AU&3m@!R|vKjw-E
zWc|>bGiT4WwYM`qaUMSs@Uo$++;2l}91Gu?=G{rRx7li}O-ti=`sm)h#<#ahCutd(
z?BU&T*eo|@Vo1}|fWM54GVf0vI-I;_?e1B{w`QAfkKa*XIaOkX&e8juQq%X>E&lg6
z-#RzH^rhXKD=Yn7j%0n6v~_e`cqj8fJ&WCfGdyeK_wRJ&c7M89;Vo+?+Zmp-bEQ{3
zlk7O{-tgcu154}6q&@tPD&GoA8ceR7S74YRI=@7VvC~CKP=P~wiJ`)S8@yXq^yZ~<
z<Z1uXO6AR9;CM89{{)7{b5&*Uy9x}#VvH^bBt%PJ{`a%o&s05~tNijzD?x=5r%p-Q
zR?XV-Xn8zKcSzIGZsiF)>zNfCK3ot_eZD$u?Q|xmv+DfGO;fe6&n!51`n0y-3I&5}
z8aJMu*_?b_Z?gJ%n@XeRnM>A2fB*8v(#5Lr7w?5dzO&0-uyahfS@-PB&kGkaQp#sr
zhQ{nTJG)#+;Q`l8?QU`X_M7eVzFX?G8wYV38E+1~B~UPbscy8@<9X?68Mn4r-g(dc
zM}J~QLf`4dv!eM{>Qy*I+9@bFD9pL1rX#uFdCD4>`08mA2i}Po33a+C)vjuIkg!JJ
z3FEu&kjlRd2j9PbrTf}!_f?(EqRdiOg0Gl3UKQ*(BBuY%;<{LV=At?qbA1Uumuqk4
zE8eqM@UMe~BPgivKo9TloTkQG)icf<l!+^LXmI_v)H``Y$(s951pg@XAGl$9wUv8%
zp_oMJ0hZX^d>x&g=HK`_dt$QFuP^3wU}O7bn$Wh3k!k%*WAO#@Ek8e(6u)lH>tNy7
zvDa^c#HCHC!FQf}@E!WhWvh0jz+Y#T_Pz5zuE<{8cu3~G!v#IlVz)G5<Iv~lo+|$D
zsGc%mf`@m0u}1?#7>A~ItL3{(PfBGo&Nk@ryt!4Gwo+-LhYAzRpZJO&Z&lB|S|Y5#
zAiLuBwXM(IBrPo1zawWyqrcLRh6ZPLv#pht2Y&gsot`bTUfkT7r@wgCy5AGC1Qi-y
zI&9Bl{{PeeZCKCG_ovJ<`Wn8v<uW#=u(j=fEmg6Y<?xCX`A<)M%?Mg>evO6ZP3Lw$
zr?Aac6B?i7HuR{?)=KL*oH#L_-=SeHvuoz3NHyP2M^7p`EKmq3KHx53SP-maP$P3g
zph*1Q#hIeg+V{1RkNX{0HJiFlE;PBW(~^aS#W=XNL^`iZJVz>lJ^Nxq0|VoW1OKKz
z@^KF|wmP71p0HYNWAsmOHlMGg&@j(-ozS^=u9kaWB(GOgSivlNBPV{cn8K7f2LoG9
zB_;mk__HIfK|wB-Ico2X%MZ_T9^W20_ujnMU+T~A-fR2(n^pad&%0kAdF6d3$)$mT
z_kW<wY1?UXm4_4+wGHZd85!L!vurOg4Ssp7{$I_!-M<;;ian{k=rsNPzT3GXvJP`D
zA5ma#&%S$$wfz{&8FrQ`o{ID%M{`pI7@2;CiNDyvdAU)|J>l91g(pT#B_2=Taiz0e
z^;9#r2>E&Q_jmc$66Ul?7qT4B-V7*9S8!0snWulIOgydp*LuU{lOJR+-oT-M=3UR(
zD?EjLyLVJJGARi7<q0v}y?TrH+=kZ8lLT(KM%-}ckW|o-)M)!6{=B-=V#55rFOpXq
zJrv>VND68w@zV)c5$belXjnWc+v${+;UDe9wyiT87T7JUyzBP*))mjT?A`bOUt&@a
zC}m>t2=d7JSrfhc`pR$lMzN|UAL<_1-=FvX)3Sm;pC_GtbUDfG*Nuh;kB%-n*nIr;
zws*%R4es6F@X-1*qvK`%6qXf{-;Z65+@SDh@timp?%?I;oYa}Fvc0^$)9lTzO+7bm
zxSVET(Fk(rc41;E($dtNYpDEU{{KG~=LLKuYK)a1&sntmSl$DVcbhwyrBXt|!t82(
zs7#IE<<n6!bw22+$2=wE-p~90?+NzWOs!KEP-w`DcY3~fKTp|5l}{gBciw9}?fxL*
zhR@lA>3&>CSw9{4v0%rFekl_LmQvwxf5C5VT$0w2fo$5lKfX!)x!HZ{`FGO~1UEdD
z7xu4DzagpS(kJ@7OgQA4)SLcIf1Y@O$_$4Cm%qj&zIG~HxBCD)i;?6%>04LczwPI%
zcls);=4tNGaQN!Oc(d)br6<1m?&@w~dvfgd9J%%De_RQBvp7CnUQi$;B;?m`eUq}I
z5x3;D(@vD<UOcqYm7D!f_m+mVKkIU=%H;~aywF_B$e8}A<HmoP_y3M;F)ePntnQam
zFnj*OnN1TfZ$5g`<x%bGFuzL<_cuK|`@gu&@X8x@v)&UmVR!ZZ?buyiUo0h@FyrQX
z2K84T9tuxm<M^;J;B{YUJQGWiQ-0sKw|CF%uRQZnd4|bJ|HRaN3l^*>eqMK~{eUx%
ze2e7zJ9lbo$~P2G{idBEe7GmP>>I0v=7+L8Y5Dz~n%xYIJuSN2JiTj=cSx*WZltSn
zXSa?^kI0TlW=6fFOF7f3TWmF|zg-SqYIRD!K4EHk>m22_BMi(%Tzl49t}k%ll(e4l
z6x9CX5cn{)Jj6j+rhaC83s*xzd%ADllphN^JTLJJ9Qp6ZJ}G+No%<ipYA)aYT-D@&
zeEw$J_`6XdpAF_tv}03nu>Mu{)=D~BoMC&@$H%oz&C?ZiYyST7H8gzf?5sTTpH<-^
z`6QcU>p#iK5|ff^eexX|m=`)+Kb(Bw#KhnSvF6p^l&&x*pL_B7{i|Ei%Od7R2kr^@
z`A0umY_;8csrTOv4m<C(&N)`V{N~2S{PXjicbCb6+U`Bf;yMw(-frJ~F@|BH{iJ(4
zBR}p?dL^!YaW9kn@s{8&AFk}zc?GQ{#jm?Bchn!RJ$-X(lJzfcaXnL2g#(U->jV-X
z#Xm9o@IC&*i%>NOhTQx6_}AL{=;?0U{@vwH_w3_Gk8;QRpSwO`^2Nm>74@Ib`n;Rx
z-ur9YqD5Vjnp;F%^8>#Ix9WCWdhF-6LiCXwtGYn0wGT533kT2rmwX&Y_6D{|n#ovg
zsA(xE<Ny6r)7Z=2M(o^Di7-Eb9hEGCi(4kGeluAdT<15eS{<-GtwW;k{<Ro~hKRpE
z)qd|Xsb8ov+4GXHz=IbaG8G$FA71^KFaP|$^_l_J4#M|hBt9&%=<4B|B_Z(Sx%h(c
zoxiRZmQ+cb=dsOT+H*?YuI9+U=>7kGMVEe^+|+bh+AQa2TJG(#L-Vd4<*e9Q?y|74
zu7Tm<{N>LV95^uHaC`s#9fhkmZSu0p{9*m9=J`3>HH_=x=31nkF-iXYF!8B@w)X6k
zD^|VwcVc32sNSl>MfNHxOs`wG#6|wUzP8rdBbs5>!KLoIHkB|*&%VJ>A#YIH)fLtK
z<J5}K_${r>S&rIWUtgQgvzZweQu6iH)z{(cVoskpacfWI^+m3N&FuagHWiuMo!QKM
z{erXr!_6BvD|uur*vw`oCSKGIKj!LwytRG(9D92ivuXyn4EOe#e=JqqYa}C+l^hD5
zp8BfjtTyAo%MZca;(G4s2aBH<72IBsBiH?B<x2JIYi>4VwaVGHEf8(mxP)Pj)5DPC
zM^%fzGkmH1n51YN^!a(a<W;F_H+O#DS-kU%$HS7s{-(`c?ktQ<+^j6m`43bvuyT8q
zB)l~#v%URg%c<S2N}m*JroLItGDA?NH~Ez-xX+{$<MgsrQ8g;!j{h7*ZjM*xH?Pi{
zp?A+t>u=vhae)J!1<t3RNAIsFJ@U@qq~T%Omx}MrvG?MnV#6n;{Fry6Oqj9h!-tHi
z@3%(WJa$QM+6IA-|9-t*?-O|A!N0%l)!*e>w0Hb!OaJ@3|JdrC&l(sE+cp=kpD;ng
zf!DN-k;A~PcUITsMf>6(e|oy{Q{kJhesh1Vn5puKX-Qk|tI)W(nbW3q{<wBJTVvV9
zf_96ytHV?cN}HbVT)TF;p~>;Y8yh@5vUZfcJ;rZu^Y`iWc&U4hKg#bg*#G@f92w~v
zbxX44js^3LoxfhErd}1bH7#Lc*|DkicV9<W*XdKITx)yp%q{zt!xxl1K`lA^wycO-
zUg$IiMjyAr_mY>Dwya+N-lo9nj@%B<g4KVXa(CQ*`TKpn19Qj*=9KRzwy*_wFwW4q
zF2QPOskijgT8^T|IkIUzUf!pFf0gdND>b!cYwGE|w4MVd?)~#l=}u&DNf2OU;$~q9
z{r;eEL4t-x(wgsy<y~h=cdpx~|Awc1{lXw2M-Th5uQJB*;5zg`Q^*WGwO`$hu?J?~
zXKpGlWA=QT9o*L8I{U`=r9PAD8W>nvEgl`aww~|S9?$Zy`KAX>t9_^lHQ2eYru|2-
z<mF%%jtyyj$L@A~zj7obc-fES)Y}=%y|2xio2Rd;IqH7W_S!~Mc9w{)J3Eb!t_W^%
z4Xl{AIKZj&&<7S4&NYT}ZDuykXxYuK5~Z#>^Ny_S^iN7m%sO>Sfsbkz+1+S4b&gdd
zI>48GO>T~jz~{xa&8LEYy}0=J%9Y>~caB-v<e5!%m$a{A)7$>6fkE<?hFHM-h(-mm
z6-?Vz?bfVpV7Rz|JHa>bqNz&c4BK^)A_@-MIBtDf-QT-+Ph0-IJ&*s@Ze2Ii<89<_
zvz6Y{8(;A=R&6bMYPFNyV}0D=Ddnv0q9rP+$2jgBjQG#0vVnzNTKweVcWiQ8F_T>r
zBAHmGadNb**{if{l>>WNU-AaID^BN6<#Bx6w?QSuLy5s90TjBw)-W)J#HuRA=3Z4P
zcr@#R+1Jjsugbo!+bN-L^FV+9Y<`Zc>)TiO=lw0}zxw)dU;d^T-y}wv<fiG@nX={P
zTspsh0t2JaD#6Lc)4M<FbsJSr-d&$KRXXf{$HX1$f80noG2!F(eDQ2njt$OF4#?YN
zbm>}N`jLKSM)#$X((Xm`7#NLe8MkfCdwXW?^A5qui7n^O2#4)tW_F&K%B#{?$Ko@0
z!?bSsz)HK=n5x{Hn>O#-HA_<S0+)-+o#2ZGEByTKEmKrD@ML3-w4h+O^dw2m*7xEJ
zjb7V5b{NmFx3S%S>dcvKG7bVFBFpyqKe@0__#&s1mt?^D`E|cIzs*c2@JI@(To`Dk
zI$iI{+q3_J{O8MMToD$X%3a|h_4+i2G}8v-Wl<-VJG4&!#O|&pD0hX0Lq|X$AvE6S
z!}R6MVM!C(ClpIaXT8YJjWBdtIZd%dz*2C6hsvdNb`F(LF>d!qS(DhWr<@T#RXkH=
zFQem*tDoP>#>sE4pM7+@v_blwlW)I&I;M2^u+Flr90Ccf5(hWW5|0$w`YGP3^5jNG
zNAD}948`qt#rXwR@N>kTjhyCwjHT#eA=?!RnFUR;Y84(#tq=cMWX+iAc57R%en*cN
zx5<x<zH_A-ikEImWq!e69BQ!kz>y0R9aFj4R6?dto0iGMvB7-F!S>d*DbG5YIljn0
zP<#3Exekkff*^C>l2x}IdTfi2m27GJ`g#5Su<S*hzob~+8r?tMduNflMZle%!HeYN
zeCOF*Zt>>qYX7Hp?1qdEYukgbg)=&T3Mgzi!qe2mV|XH1OyI}O?&@iq=hr9A^0L?%
z_+gs4%a_gKed>_@U|oa5qz@ZrcwUkhV6eH~Qc|<?{L9>FEo@(8BIh<JzB+e4F{+z^
zF)DFoA$vi=i;eaF71Opo;O01OW%~YmhTeiuwu5(eE<9tikW04nW7ebjD}&Qbt3<D@
z@eE<+2sm+e%j93bA6{9>6O%MG^s=GhNBjSOjGNEAS}D2a60cw-Z-mt~jphSV%Q(&R
z#p>A_6qFAKF)$YNdwp-Tt96sHpLcAxuBFpA$N6@LL)V?>m$Q+3%Go=W`H6yHv-qy^
z(E3ReR~^VK(0Ko9Q_|6_Wl=G9O^-NgoF!cv4t!#0+)%nv;r<?n7{ew}aiiX&g|-48
zpTFKK+3BJ*(JtJ9!BkS=fWQZ3N7>hp-^MSuQtou%H(~ho;n|YQ-L_$0J$1ADeN#a`
zXNoetx~E&*eU&%ovlrX%*Bwr>J0vZ7`pP^@%b(AfnMzn5OrQ0cWxKITzzU7N4US>P
zOhS_x4}N)BY#|;fzfj(K-6kofCb7cr@1`1UE`8|4m{eNII8}F!f(awbwrzLI-e0?<
zHi3b$#%e<R;<R%bJ?;m&=P{hI`TOUy{)^=q>DQf~I!bT-vu4d6esSk{SLZiZHJ^8G
z?>pFVV9UOqjBTqvMfc0-ZD|ZSm*!M`G5Po<4uJ?I1&y6*ZTu6LGVl9#XrqhmD$V(S
ze{p-?R{LRjN_JP;G;M90vrj;Es|pj#v7VPd)?V9ac7*M$$o(amVHpj76Apa(aFy}(
zFWubN_3y8*uvAcRU|4M|ak{uU?#s#M>DHG$IP^a{tx>YQ!TGRdW&^{zuHt7pX;B<(
zr4g?m&(k(3vah$>zEOsy&dYca_ncR6-q>8*()9XZczkcT;Oz&OzcyW5pvJ*$ZCUU@
zL#fR4x?rdQ1J^U=CeO)#E`&8_sxz^C$UEY-TWX6*Vw3;kH)pnJiI!*`wYzG=n07wz
z59fxQn_7Az&Ubl(mv7jh!C=>VNbFDSzTIB$|I~!OKIYy(uVMePPje1x6c%y{L?|c-
z6!CX&zMFb~nb=psyeCGrU)H=)yTKx4pSu3d!3Z^NQ0GQa!JtvlXS+~Z#MQRj`|iB_
zp?3G_+n;`Emm1B}W8!4D*6&+uyQATX+eC&&xdsN_R~H`7-Myw)>|=#;`Yg`lcl)l#
zh#sHY?w`V^@Z!S4)A#na&k$>J-8nhfKhCE~dDq)}a}U{FSQo3A^+|0*{g*m+=S1HX
zSFV_t%8D5{IVoMJ?8{WS6LMlkfbg@exvw{6Ugk4qWIVTjWta#HhsZ8zw@0f_-Q^Fv
zzI1BM+9@+<K7Mx1z4x)Elvu*U*tE0zH-9>RD4J1Is7Kvq;)8h_=8_@mU*FlrFv@Sq
zzRo$Pb>)RiOASxm>$)Lb)oswh(8$Wv#B8`|`_`Cw`)5cTm}iq>_3~!^qjgJ*-tsW;
zwN%J|+k57sC#V8yIPirrNZ`Uoy<6X&)~OpX2^$`?+#Y^hChpCffc*=Xwy<#sykO@L
z=zi21RNJ>MOiO1Emvq@vmq?#A>E{=2&l8=^!?B@W#&wgSd3?cd=hF_Ro&2w_shVd!
zNlSme(CBsIy*(QqJ*r-_vPG(B-ES*B_Gk6qa<**~<J__J*7ox7%e;K^Sksu8OxjPM
zyl7<I%)q!u<w}j_26=~L>i+xEZ*eZjy0@pm<*r(=(~pG}6&kj!ANEg>VZ6+8EI2G|
z+u~&ZopW#8EbZ@!>HZz_F2cfo#mB(UwUXxNqJMVw-WSPu=GAb3iIH*L{KUQece<y&
zcz(h!H*x>pQzerhI%W0lkEuRdzL)35-P7PP6)9#WCKe&q`S&gw+}>q(?%&nK<=fL^
zdpAlK)E3`*`Sf4k*OO~s3C*(a&v$Ha2zOvuz3txKpGOZfv(>Ck-0<r#zyJHY+K%pf
z<My^`Gc&z>Yoq&?hh5;qt{2%6m--dld*cruYV9>K+h6xr>=b)QTYZ7{42z5YuV$>+
zzG{`+iY>h^b^jh8zv>gL_^|#EOT6iHp+=1=$r;QX36%j+QRlV@GctWqZfIR1m9;JV
zI^)^1rhfD7x;M<Jey!xGWl@p2_@mh>xheCVG<Z9DeN${yRdchior!xV^T=@a)TzvO
zGM`<xw%^Y%-TCL|vbR<r9vyXJVwuLlaVlR*?CZ4Ja;|sEclI4Axx6{PF3G%z_w(Kq
zb%7hM8WR!|K}kf(A)!u=vCZP}DzzKGezN_(empk7=)ii-69tu}C)moEOCK=RPZJeT
zxF9O<;Onh(=Y;J~ct^y3`|`5?j7?~Gef`f*r7WBW51xMTEj|B!tncB1J#*DnatxOr
zak#vUVG?hnZTZ^YxAW6YrFwfLjrS&+9J=ejJ&$?1PUp0&_g^nB^>)?Mi_22Fbj0QD
z=g$voI&U#<v3a7_ykNW1oaer5ED>35acq}ESU3{)cIvN`5*DucJM)^8(7_)c?Xp(!
zwDJ92b}=2)q*tgqHJP6y;zUK$S;ju4^2fchuWNsQlig9R^~Bo3y4rKz8m?P=e>>dX
zw(it?5&oH7U0mQG`sCm7@7$(o=Qjq=dl?wdpcfE&=u?09@5L@AMjwhN)D}0pKH|*P
z(*yTB6&wzH)L+BBxpw2(kEKz`GaQ_lXX?Gr;wsy+-aE&Gg+t&4AIG;hmK@3qc^&&o
zU(2OUs{8fwIXl0cm`F@p{+|8+7rndN{-EIO?Axo&eogyuBxo`t<F$u{uV$2ad0KYW
z&U)p^w@*ObHTadiotXW^EI~=hc7u5@7dy8H?R@uoQuQJ)dD9aW%Y7s^e_sFWSnq8g
zHWrJ?Cr{p<xlGBSK}uEg#nY`%zt?|xVR+fGu<&Houl`*-zG!Lfx;5kZea+}-|EI<m
z_W#dVvQ9%vU`pN3r`NeOT{mPNNXxKYVGt@JcJyfOlc`Lun)SX3{tZQnY=R0K92_PH
zDgU@}Vrd-LZKoUO?Tvg}=CS&(Zk_bn>7U`@d!moJE+#G5{UTZ4C*0@%ksDj5oa}G`
zck{L!W2p-(72Kv@ZoThhNvq+(`d=aomOYsAc(+*F$+fD+PE0JmoE%|myPnUgxP7C<
zy7<|gjjc~TT<q&?Y|YFV<sQh%#c#`ddxrP_lVd9agMHapWUY2AU$VsLZOOYBxk*cR
z9+$N!@Tsg!O>RA9IY*26=Yxgm=M}ldTI6l3CS~fzsF~hiTD{==d;0{d7cVa_-~T);
zqWH0o#P!)c91P3(<ZO<Ca@*H+GCPjXuxr}NJ})q<Xrh$v4ae|S$L3>N_p5K%@7w7U
zx7=@T<LQP&Q*Z65R5v@UeDT(<)q8jwc@4S_G0XhV{!<a3v#cWAqWM^q8fb)Gjh*Y$
z)|pPX(u#RG3O?^yt$aezFIQ~RbFPQYM;XMwuFo?4VO_(O3JM?<CKf9Lrk+UI*Vk|D
zwR-c%>~+6boBsFGHTSQuP28Vi?$EHHlabNuY3P=%uY)5hT$k>2h>T=i^Y5Z%&6O4T
z4^LcO)Oz^-|9ca~e-yS&ziv>YahZYf8r%6fmglz!cU3<$KWD2qO(yZ)wHMlU=PZ|R
zbFvHheSLp}&8~{-^R7zGZCk&bxELT9e^WnNtox^N?Usy-MGgXa%d9M#PBt+76?wSf
zhb3dvmoNQ192?a7ukhy_ZtYR`TVh<>qVKf7Zq=&ZZTa`lO9`(K{+@20bK}&Lt!Zc3
zSo40^ePs(}i;Hu9I$L&M^|jcYMb7^wPy719u;q3!qxexFMn=Y_SHB;v%6HlMizz>Q
zTJ@1HHH;<lJIY;sZfmKX6KUSHu+L$}y@ONCcRf8kv&REeE;Trq`#i`E6;Ao9bnxNR
zxY^G)NITp(?=7R?us~Shz_fq=xOa3uy}P?!^{e0n;YqAqTwU98Zc2)XblF5_zt%hP
z^cerDr%W}^e==5VG~*Spu>QWQ>TBDwlOMvDE_Hg^xQJn=U2DhI2N7w{PG;WT7JPr-
zVlkbFt`;Wa=5MQ3{r2Du5bU$czqzUVrJByhZJCcx&2@W!Pgz@Ql@BwEo?g&`)%FSx
zN}68uY4DYAoyoj0DJ<;WaaNX_H^tw)wcTm{`Th9`);{vKm7h{>ZMm6MC8-m?Z<gt@
z+}q!_WE`yAwP=#lLr1H!HziN6NarZ$TISx8X<ga0<iOt0dq!%)oE&@jRu)U9G&VFa
zI3CS4tl6M&*^xngPON42mYoi%hNo?H-gS#yuhTyFKJmlm<6l}{F57z|@?rYX4eMvX
zf-c?X!S#h^%S7dQ1r#nw3iQa?PL-DX^3r(Cjvw#XpE21tawlx${>D;Q$1Sk>fR*<n
zrdjK!SXovcTzcNbq2UX=X25l?7t8Lg43;;I<mWnmcSj-nA{Wu;UHSeD>)lJlZft%o
zuxf$tySp7V?C(;n%HD8!doP<E6ztEayHe5aM7p7*z=L#qJ%t0C>Pt#w9=-bf{H&~{
zrT9nDe>)l&YEGRxHMjaRkE~VTsvQ@dE5uw~4rE*i^quc|ZAJp`iMRH*mY?siF84~Y
z;ug`Gf2^b9e?#X~?d#ehrHZ_asqvrovrd}I8!(}{nSE0$6B83l#6$jXLaB?s&-!`y
z=3FPylT7xTm;2xNTkUjEGx2*-;!%%6zx0dO#R3j&`ELG4VcklO_9oHAJl2O>*3|a6
zDAkHg<PZ>0F!)ijqC2m3#(!myk*TRqYk&8JF><sq3+yqMb#Z#%DZD&!Tf16KhGJhs
z`{uIrtS{X*f6Q{{B<PxDPI})fy<L|@;6X@56`zr?fJ29nb;*l0S9fMzKgYZD!NZ5z
zf&FKDL@qho*sv8Q+f3j{@qcm4vr&3Y(u2uqY4iT4Ff!H%#4K(5Gfm+$BV*O?k3K#I
zYbLJz^7;IJgS%TwU*|o!Qee+9i{pf?+|FFPi;m8G6F|eS{f<Ai-?cR<d|RI2oRE6e
zb))WT|HYG!$otK;ivGyFsAJjhqus7zTpSz%BH~<*^NLRYE>-*+anm<)7QfzyCycwN
z=`P7LP&sR+BO4?9qAfeI$~r4Q`{H%Eqa1Cihg<IoIz7z%wqeTOe#n5gfP#WU1IM+?
zpS49B5?B@`Z8>%P_s1^L>;evvZ^<i9Em2}Rx6xopQTdV=>?|zms<rJZSwEx<UVLU^
z+2J#BVs^si1y?PntT`?4{@`PWtZB=FuCKpeQtT{gS^x6iv$ILdW*0cJ^F4_VFjySC
z+{iA|q2Ze1d|m|ycE)wZEDVnx9i6!U?>Fhoz6p98l%LiKh%kkO+b8yNUbQ;4K$yYF
zNa{~OVxnP1%E?;ohZiSI55Hi*%E_vBs$6qh=9!GgZ1cMIg!Ksr8m264Xka+l8^g%B
z^GtMD?vL;_t2Rph`(eAg*3)91%9_uHEZ?0%`fU?tOww$4*jvN+de(zD)6UHiS&IrR
zgI8otkn?~{Lb7mh2s{X#`a7&wDtS%J*Z2RoZuyYv=5YGtH1|r~gd=}`K8z@6-f9ti
zVL3mCimFP^G1Uv-rYt%?b9VIp2@_u2xznxX`{3cd<9i+6z5Dy|=FGfRYxdX|Ju$Zs
zxXv$M`%;HRKtRF4&c?ivgGEs(iT84FhF@at+5PPwE2d;_?XX><Wa;`d+cZBeV(}9H
zg^cVBj8^l^T;|-6+ZoQq?sRziT<h<r9VY0VGFuen9&wt5g@xltwq@JSPYnts3_Ps=
zc6eTRv*Fdc0>#if@|h8~Ck38#vevIOJGe!0L5ISDs~cAI&oG&t`X-@V|FK&Ns2o;u
zZ~z4xyWut#jvp6y`^7LYPdz->`n&Th{UYI@VE>{W2h!O&mfWh=$$9iNF8IMGJ+0<=
zE3t~5fwn~^Aq-46^!S>i887>0EU+khV<O7?Sw`@5B!_~6gMjQ_G3f$E#(j5v{2zL^
zyr~jxcrd%2U;eb{tuHSOMQ&^Sv0xD6T4R@!{piV+;D<YkpZ(ct`25t=*HdJL)6Pa+
z(dZ7$+rZU--gkCcgbt_@j<NpFCvc)tfaQC__2P(|yfgT?tPk^QyInu?GG|6&^10}l
z9Y5JCI}C-7+D>#jr?}AN<4bo$j2<ZFRuoR3Z>7Smkl^ssc9nDj&%6m6Sle>vy!$oZ
zRH5O)1_9yCd~JLli!Xc&S!_~u$0EPNZ3ZWY!Gpqv7arjb25D!uYZ-CK@XKrvad2o*
zFuJ6~z_=;lf|r0Xs65>G@~y4Ar8LvKCGu4@D(d;}#^FB?v0Psp{r}akvQH^iGQ1)>
z(!ajRetdYD+28x==N=h(72Zfa1qFwIDTxke`d^-$6XE<M!Z2q-V&Co*tBgsT54`Vd
z{&()tp?htn_qp01w;U>YxbGIfAIwJ@%?nsQ7Eg9)wD)13sAQdR;QEPX*IjR>YG0Rh
zpD(QNfLr%hbJMg1sm<GdO5F(Ep2vJ`{rwa*g%{WJr=)@kK^6`M$%#Cvd<qSZ9!)AJ
z*DPP~qoDKN_40X#1OwkNe?Q9~W1si%cFX_O7dVWywcAYvx<o!a3HjGBS^f9phxrFV
zmDe)G4c&J6kzq>9xby^mE!~jW*rC*P`=@~^JYgNEnaJMA_l$|J;=Zh5Qc*_6lZS_o
zKk4G&5b&@n)8XYj*4JmdVa7^DwT9N&F~SusN(?Mo8iyTXtoJfD&Jt=?cJC8b_|d)M
z$boa`f_*y=E_T1_<F?#y?#4~8R_&R*U_rn&hXr#^v;=W{UG6vcYR0yv`S)#FKetV3
zIdVj|{^iGoe;6<R6Zq=Fvnj&Cq2WQ+wkGyQPUkuQc0W#7Q0@Cfqp-T8piEwS=P^iw
zq=BK)Qi-4G$<tSitOf?r7uq$0napy2=;`an=Y_Fwq)2U!=iu)C_T|c&7YyMntY=;v
z|M#%f!M#>m+D^e?IrEXpRkK9Gc%{ucrE?$cb2`ex*yMj=iNH$dhYt<+|NZ9UKlxBd
z#GE4g6E7|XuU@@c)N9?8+bjtTCHMCK-=KId_(f!gvGVkt!t(nim%Bc6`1kMNn#P1^
z4weNQY!!5F)e9+X$e4WS^-d`Urau;K{d*JR$`ikCKXlOaO_AE_Whso_pb-)WhX#iO
z{Rt+V3M=~a<BAW>&QxJy`7r18#pI4{&zL?4y}w<4<;+a(Nt3ocdm0-wgNKo6T>;ZW
z--cBG=xF{H;k0-K7eTp`CvV&3%x`;?l{ee@ik-*Z>m@=}8jIRnyUg0xWoFJ~;jmk1
z)-I9Z`|0)i{jY95`;hSaTizXhqrye!zu*6V;7O|;>!}W|8E-y4&3@y5^q87ScG~KK
z^=T<{l6yIIlP{V$Zru6iL)*7S#n0Qc8YFV;3m#n8aNdyN;E>CgA20H(YSx_|hV*@D
zW$J}56P%M8Cd^WZIa~E@MbktN6>s5A21Z6E#{FB;qNSN`OV+Fr_?G;Fjbqkz^=&I}
zn+NRJ)(-M;P^@<O{5@>!+ahjl+Wx-pX}pTi;d<}Y4RaV8mwftk>hf~q#5)fj8gfK3
zq*;k1sCz8^_g>>8-ygx9zg?y|mX+CE_TAuAQ25X)$>Z{l1VvEcxXjl&dTY_6OG`P0
zYnJ?Ls^ak9^W@3e+}l>g_9wPvZl1AdTkh@8N5$ug2t@z>HkV)4D>uVoK|p*1k3x#7
zfh4b-jpIXa_Inejl|8G~jrjj>@l<P_A3x8gPf!I7T9kLW==_#u=Gb7!!+kU|uPycP
zQQ1VNdlS>A9=)>g!X{TGa0_O}y}+21VCmbd%+~ZOF0lReJt2dS<CSpqwcPhX)p}E=
zJ~w>p(!dbiC;z^Adz|nNhH0Ax)|9>drD`SO=6ZCl#l&;v8<ZUqwh7;Ubyd3HWJB_V
zfCMSE16A+e-0c20$zzdBod@&6Krb(?%f6bL?mN2=NN9BJto*#Km4S(g<;SD)H#c~1
zuPE4|ll<#NyZk&hcKyS>Xa6cRHg;E9TK|0aj`^v4=Z{|&E^g~4nyp*0;?VOTi$_N~
z_xZ$aFb<e!V6|sYdb*ietmx&eIU6@F1$kQaI<MkiSJB9sH<lf{9yjN`=DL4wcchPQ
z;<t1D?JU&!g|pKIRKnc-HSOz1-B)2-*YIX0f0%60Xx6}B%KG)BY}TbeM<*HkWD6=Z
zFfmX6^Y{I=S34U*So%UUBwoI~TfJk)54q<BEF2{f`e)CaiQ8i#^l)9~42Eef2Bq0M
z4r>PgS^D=w%!Nra@|?Dr@g1CNb1hafMG|D8pP!x8x5E5+=jQ%?`dr+ow&f1Xmro&|
zT{}8E)4Nu!+SQ-J-Wto;<2UPxw(XZom(OcS2sF$;w4m`M_jbqE*Yt`EnCk!8X(s&s
zR_nm<L%3C1+BQH(RzN_ZA|k6pu6qf?(i%qlg2s1Ai|kL|__yYXR;kjD7HAAth@VMb
zlOHI4b646u@s9tzm(Q}YluGYgVQq1#pt8G;vy_G7K}&mhoayOvetx#>{Brvf4sMdQ
z{<b)4?n^x;mOm{_Gx(BtgA?C|oHNn0sL;$_pz6Ko${pqdp8oUyo%kYHqkTnTmFMI?
zI_v@h3L09w+BVhX72I1FtM0PXNB-`v&^PbSy*<`+<6heReTRFNU%0U1+1a<E=iK_`
z;_vP%jn3*`@j&s_iyNC4M!F@Nzx}!{mR)PkF8%#7(Lq6awo6r06wH@yOkN%y5)x$r
z8i?-XtZTB76-ZLsa5?(sTJAF@6FpRfUjE-)KJ9&=O1AWW{kX%4s?v*G3d`><IB3fz
zs3WJMxMTP3$qOPkwFoG_%Mj{V^M3b+%3bQK<<FlzH&KQ!_^z?-w1+#_|7!3l4-XIj
z9(sPgT%tole#3+Jx&F`ZZ!7<o8oPaaC+D<7Qx9Y~FsyHk|6ca{>zRbue`Z<R)+nB8
zcyM9i<QqqWHmS!tIWG?6;Wx9q>Bqyt6Or+B<;vX`RcHKtW$XB01^?xBzN>aRd+q-I
z%O&G7tKrqCmJ*2#9}<}~ow|-3@%_fkAt9jPc&vqUb3%2uKa;bSTH4>&*Vmt9+Ert@
z*PTEA=*0II4{IA6&z@59>x-(2xJVe!+uFx&Y<8QApS5ra2_zUPRJgxcYmj?O>0qmk
z4cp?wJ1*?0<l?h?eJxk<#6Lzx=BCZBQ|4S2P_UTnp(ECPG|AyXMT5irdu}p0Z!byx
z4^Lv6Q@k<C;e+(8!uQwCga*ZmMaJ=PT)OO<y6fM~38@V^T5K#yscCi*n^d^eCP=ww
z|9N)V|L^+v{l5P%oSiM{=l8Gc_^usa_SgQNr`n`;f$>IcmY2jF+v--v#=i_q8cIrR
zaUNajjsF8<V{K<f=imFob6m_cFQE2!pX*YdprFjmosVYZ+}yN%$Bvo{3pkl3EiJz!
zsI8~x`rse)tHK{2JZlQr_~&KJa7l3c`QU4&ctZmt<0j4%igIkPMc%|*`E)n@$%Leh
z5jt!vJRDzm1=$xctj(@sV1HrQ$RO}%*YeBn+3kLs*7k+e&P?W7!^RQdasGV%lt-D%
z{bDnYNk#E+=xpKKU-)=k-rW#GrWvZ%WikuuSFBiZ<#hIk|Nqt(Ni}_az5U;x&*xLZ
z6mu9;KZHok%-8+%=Z_N`OBpN2gVrlMH}8Jrxj}y3^zekn=n5`p+2`SJ7&cW;veJzZ
zXmdWI%6xAA{;n9gwz|t}qt{PXKR<a=fe=ePN6hv`iyr0O-SzyHmBWKo4L%!xTrp6{
z5N~j(ifeOXV`DrxD={S40Tf+DAr1!`7U-(|NUhy|P4rhs^LOT^_=|dd6TA7NC3y`L
z4)mv=2{`xu>6<&xZ*8?sNj6jn5S!DxyZn2QaK?tRw`y67GcSuV39Puu9wz^G&Agz4
zKB0$AnV6nEc*Vo<h6}X9W#_JM;S-q-u>Smie@~_JGvRY(kB+d4_8BKc&s({2x5wN7
zL5G;&f-^H5mi&KlYwL6cMn^|SL(P?o7VYw%zmZAFEb~%KN%V}ABbD#&2sB9uY~ki_
znkH6xoXMG6S5lHIUsef}=qzm%5*!-7&9*aiOftXw$-kg;Tf>9xdt>XYAKtFAkIT|g
zoOYFwv1RJIN6&P>Z8~%~x!2a1nQ1l;x6w4sTizP6g0g$3-Fl&K?NTWJW9{wFN4qmq
z?EZea?0@p)?W-yQ4l=)AUG-!!RM?>I&~Tn9>E+co50#@ED;M(!Jh89+C1hF}At2ja
zpcSMZ!s*D<XTaNNANQe8mbcE#P&i~)Ol!*;u_gByDi5w~(QmbWBr74XMcjeGQ#@Fe
zIg=wr#Nx)*Df{viK;AG^C@68bpk{dRncmuswj~SaGYR~ebnEH%#d2#U1&iL+Cf(!{
zSYQ`+e%^|dbsKiPzh57d_fC1!MhO9p*FQdPHh3($cI(k@<y$_YKFu=b+Zli8i?X}T
zy4bm3%^IQefxW*O^0;PxkQBHft#IM8YQ(Cp?|mLEvAg;H{&@U<vSmxa^k#Pd6(6S`
z)a{X#USYRCEjQOzZt0Cqh7yMl%~Q37L#wrAUS2xtEOTAiC&fZTTYI-HBNH17|N2u8
z_pvSJ3FBMv$On{F#F&}bSy-5SHmv@vuNfPiZ2#|RNJC6n)4qCFooFVezibO0%`@J%
z%4UMYJjTYq%uEF`rpxTgYH#~ZJ$7u_%gdt68|!{wdH>+x<n{6U70>>9xBI=YklD-f
zq)?uhE>m(691_|a6m;MA*(t<t?3J+;U2Y^E)x^H<lj~7UX5O^89?@T+o$?#|?HC)S
zQd4SPJ~I<=pLy-Ultw3^w;Tcn3Jab;-L>SSMgPHvw!7y=fU-ct0Yk>d1&Z<y1M}~0
z=1IEAsL&7;RJLm6()TYdUwnFM!p2g@$KmC5s_LuMnvAb!XNmqf7t*+XliixAjlEK)
zix(_#ss7kxoUT`0#TS;zt*Wv_=t#qXe+-Q0j1v##R4A_8aP{u>HId){e!oB2%+%CW
zd1LDB1rmjchi03<pJZrN@WAEzx<~KVE{Hj_!o9!CV{TT~ygmuVPYVRu?;bg(CVZui
zfsvWX!%9wB_ig3`!6Q72AMrp!xrv`sU_t4x$1T(E1bFEDW?(7M|Ncd$>2PRD=j`a%
zzcaRUG9LW)_P4UqqoN|YiL6tmPJR67lD{FlV@it5-`$%wfBy5wZH0;7G~MoN68`gM
zM;oM_;hDNj@PUYyj*-F!e+P!#dwZha@Z48>oSE-(aDtg)|AHNQ%Z{y2Dl6N!d$*?a
zZ;5p~i<=MXUr9DP@`l4t<J;4n1rMWkGqdn;ta!0*&!YK>>P_cMs-0iVc7r*XOW@DR
zh6D2$7)50E1YPW8ux{~C+n(fV(^G8JzAkQW<*O^KB|n;(SWf)vYvGf4`|vO`pR5(D
za_^Z{e}DID<<>C?)J&VOVd6hfALZ5G6~e;ivyO3mI1;!s;oO`6AN8qj*Y{K=u8%V{
zP}mUd5D*k(S@T0=s{iHt^(t(Pmk(??#j5T<ulaGO*Oo6Q)$82*=b0sTv9X*Hj&Tw=
zw;}Oxn30=xxtz!nMFErg|A9x3K7Gq4V4%?Oas6#+t*98?&(rec)gZYv!QsFrMFoeh
z{3pAEHtwjOZ7#k0_|&PUOB^>`PD(OM=;!+>xIO3Q=Z6eTHnJ)PuP+?*&DX5`voYV{
zz#n1tbF)`ZHjogQ)?=)FZH~9^-JkpZyuDeVbl{<u<iRdcPE%9=g5YCQbSrvSE8K7V
zC6Rxl;@5(_-K;D;91Fg_-o8r3F2m05u*30v|9(XqxpnwDgxS}uyfXV;^6_&k)~r!_
zytVK#U+pv3#Zlki?bVC-Q%`O&VbR!>z@#w4+FWG)ysM0i%uMQ)ziZYi&nb=LyWGE_
zv54)Wv*vTFnX#djA-_~Nf-(h25gSX_p+JotON$b_<{i+zX2M_j#`3V7TiM2Xy?qn!
z@pS*=XAziVUoSd){(Ff8Kfk@{=lSfazMOe{+~2b~AV$H+XkT%<=~b@1=jQCp(6F6t
z_<N~d@w1jC%kTeuHalNbUGCC3a}yI;%c80y9h$FrZT=KpT*Pr_)?8*LHWrRO@)iYq
zPN{d#SG0^T`O<dj64#VnA%~|<m};PzDXjH><$RmD*46Ls)z0v_ba(gnxpRL@WpDZY
ztB=9P^wZW{fs|;GJyjg96;AwWXka|Jt#V0V@A;>1xm6k%7?nL{YE4a<wDZeV?_HG+
zan_F8%bKPX_k#R<6jb*!Gr8W~GxOnV>zJ=xS1S)Re7wEwZS%r)_qMe0+LXR>%GtCv
z_R-_>HyjT0T3IPL%+FT$pO=0j?$xEE#WyxI+_=G8{w`(F1t~|jcg}jTyO`w8y~^3l
z$S?m)T7zjmFMq+}Ija*lx3;c5_{Z^u>9YI3FF0^}U5npcR(!B2RlfdBvX6`TkHGy~
zw%lg9X|9mq!0`4?CC9x_$9&|bFLY*}G<mzInArahhxxzQ?XCMObya)9Hs+X<D^4-J
zICiM@wC_K;M}Lq0&Ug^#veW)#g{s;zDd}?!;Xkge)jwgB<iOzR9)6mE<-O_M!fnr@
zF0Gi8U6hg^x#tCw@`?S9vs7(&-P~pEA^xQ6-1bLXIv=liV1}Fyy88X!-ni)dH%lV(
zWP(WUy*<yn_22EEF0#P*#`oPts<w)Xsb5}X7I+;GY%9B`7q{o#jRuC57X#14+bk)#
z)V9~Q@X?#n*V`1BBlGeWG8{~(;O6T3Xq+PR<(u59d%HL|1kMUiF=jYoSmb?~bxQiN
zo};g?<+@ooxVedao7~;y_5OZ+M-c0d>4lHeW^It?HlO_ChX|-8!0vG2&Ye94i90q%
z{tf6^5Wjz;_cWJ>-c9`slTyV*rz&`}PPjbJ*8PEqamEFf0`}zGQk89c>i#a9rX^hv
zYxru>f#T@x?^GX`z6{#+;BfYWNhh|fT6H`{(omt{NpR<mo8K9m&Ob4Aeo?5HIjyE}
zv&@y?9Y=~T9qpU)Zmv?WYN(CFU6)MP`k!-6xMrsXSDJMm1(kB3cA|tpNVq*GPfVBT
z#*JZjme2piFZt=`BG-wl!`9y2kQiO`sqD)AL+gzGuqG@ztQ9tq@#9+WLtC@A%ZLit
zR7_ZWi81=W^c4eV1%^fZ1*>juc0P1yk>L_eP3xebZOQu$wX_)jF+Dzhz9q76equ%8
zlwTZzDvjc^-5ebkEt%Ss_gLeK7YpaFC&KY2e;C&|)@wZu-e>7Q(}-WqmbvL06Nkc&
z{R)+GeMV)U4p=k9Pm;5(QZ#I@`}ANpBMWCr+kaC<afUeIOUu?qf0qn>BcPw0wv5wF
zT&+NK&-N?#)sAt!wCC8wsxTp6_LqVU$GW`Bth@K53y4{tJiTWLySV4>tkor&mmOq8
zBphc<ud(Rr+P@%wb`OV!#^0dww6%F(6>K(rKVro4b;|a0OVwSEAi}GI)xJP~z2dT~
zPu&h@$*r!@dCjV}dO@#*;m>z>ecD-A3|<PF>DYN0zLH?>e&{OvJE~jP@Jej>nov$_
z_9N{3w}(30|NbVctC5>AAC!q(a_iHd+nLR=`N5?dA+XMHG5eHhW$b4A4fC%}5!cW1
zJuPRHc}eE0Ifq)sn;Vg{8Jo1Dqi4O2n)N#O(u$u;r`mA5lgs8UuK$0E{qY9#DYN2M
zd~tPDW_Ge-h<#&lbnCR9ZCM*iG`cpO3y$6TA}Vy}rM1nU?j=;7O~_oiS;E$1xATig
zNu=D9bWnI-`gyym&-Y&OZZ|O3e!Z}nvB`V-r8#p9mi*9{Hh**K-HpPA5RTe!pQSTb
z*Dcy}be?THXOhE(dkhCxhxd9cy|{96`&u=Au1bZYZI*H0o>wfZ$PEqRd{U_z+`3B7
zqWKpKOJt;_86(REt*0mKitb2Q=55Kn_3WnZhD=?44uL=Vt-mF<+;G10baS@rwqN1A
zU$#V?vru~<-`CfvSMqv7-p+qcT)dWoPXhcc>$dFC5AnOrtIwusKbPyhS7qju%BXTZ
z$wuZ&U3Ebn)&gy-gcr$jEMNZ$oZNXhBm@i;1h_R1R(<)f(N6io`lE{%Uw^14!zQ5M
zU77LX)&GC_>9d)cYyy^WUf9jQLQqNYfa#k1=ilA6HP5}}JE0a-*f%AqK76?G*WtA`
zhuQh(RXjcxYcx4*tyhrz6@|V#flmrM)&^AS?vQ@KoyqNx@cY`XEQUx{fh{(=Q<L1e
z#aun)8**ANt#sK{yF~T-tS>Lvp2kcy?(?2*cJg@g)3u+Y7q=f=lF$3PwO)L;gh%@1
zr3vMcUzyf?T|awLb4;n?yR{GR?Fx{2eJATrg4e-ypH{N``XvhX8o0+NA@D%O(=%Ft
zAwTuBKqIH4V_;<^E8jE6g2!%bmu4$poR!PS=*II>rI+{cv9hJ6XKkKrb}+RDl}*tO
z3|Fu3cbQ`olo?Un_IU5^l9yI9{hyzUZN1#PD@u;}{oT__!J)yQS>(m0T6#^17kCS*
zs;={XI(JlvPrUl2e&IA{n+a1Lw%>^sf2Z1Bslde?<8bY|^?~zBf0thJX%dk*$Yyz_
zD!Fgp$rCXZmE3y1pr&k+!-0kd#zrYqGqe0VJA~)m3p3{1*w{Ar+pJm3`DQjhczOBx
z$&=Q9{<xLCc#tN1pHZdDzW$$<<a%Xyzu&+3Hv6e39)7K?AR(}&++o3Y<&Lf{_eEZ>
zH@Lc}&)oYdJ~Oj*{(X+AsjMb-Ss%W>zHrdhbyA?duAUGN#})5g+k*LuE@yQee!gEV
zo_Ff>bvex!*S+8GDxK$bPr_s8b#RzMl8ZrDJKx-v#Lg)d(~A4A&zIc#UGRE?=r-o*
zmMs%X`PC-x{_ViC^6Clhb*#T5V_buDL{_=PY(J*b-K8lJ$RTVh$eHBufTO|T|GaBT
zQ7r2f&m}Kh&n_i#WRmK{ojdza+&T2T<ky#@SyuA0dmW2EO@G|w=l0^{0!QJ8c@DFV
zHaJ9z@|PMqZ~bOj)e#cC&}{C*y~ahCSUew#PMsus>iMLXnVd*jcNwQyZo-2}da*XT
z8;=~f++F^Co~bs6-;X8J4>L7w`}VE(=jZ7=cSgzxKX_6w!L5Y3>-<DBQIRPwE=LZY
zW`1r`t)BQmF(o$tfuTZyio=5IN4ql@yZMI4-#jwo)GR$ayTjY^?nXyPyH5*@iRnt(
z{&Ct|yV}kzn%36$k1n0B^oE-^cIiju^9>GrC&)5vWiWaB>Ui&zS+9F`Bpr4Bd7!98
zTA}XXnT&*))Atl}irxYDgAElD91b)%EPf!@AMs_Ov%UH5@B)VeHWr!(L}w}-XJ&C)
zxInpS%SZ3i7aJRy)pKrcIv%6ku(I^kl~&E*XO9og%;(#|&ZcKq_ebXC(S`$pjE!;z
zM~|yN7Mw9rP@(zRrqqj)lHX5Gva0{bHE-4qbq?nfr?pjbojP57k~gjw<C?puOC?i;
zndxg)pl1GT{p<}_14|Cvu=qc1eY|IR|4Xy84stG^FEkd!SC$K;&Mn?j(*jDg4Go~y
z=9;xfb1yBqIqi0gmZoNZU)=rm_g~1_R%LFCN{q9z{?2;(#@5#ty2VRXrbq~A@O7v%
zFed4VtY6J0@S=4VlXcmf`DVEjmp@OKVs7s3!1(FUL4Hd`#j8T!4BQwlB)+-va(>;b
z_UxHv>;(%ZPyWBev)GV5;A!KQm21{y-PpizwEfY`%t!}=vNt?O?QTdbeE2TE^hUb=
zdd1dO)fB$F`~M5nY3$plysOOgqB5uZ!UiRW!;g+myuR*k(C3|(ZFXz@{wl33Aupma
z(=0c!cCVz*`lH>-N6sI*x^@22Mh=y`_SdtnH!QfK;c$V=VM30QLc@|tK9!esehALE
z5G|vcKPxjy=|EfhlzskQb2VMRO`FJd@y64c_fi$oW|Wkeiry*Rk^&E>*U!%G_ISU)
z;Gt2Wx9?p~F|jhH@9*umB^)dgIq9Cd-fVTvn!C5Rl~4bAch^^^H?~eUA3qL$klZiz
zSgL&U_4W5xStzUyU!R+iq4fFX)6?NAKCY{{+tSkOHdAkdo{{n9@9%7zTv89Wv+vmP
zrB!r^eDu`A-S>Cz;E^y)$_fd&b93|b#H$ljLqos*cyf~aMehH9kL~{5@K3d$m~r6d
z&hIBC247zjS^cv}$|m@POx4FlcAA=@9~Z8PnW?d3QQ6WZIpJ@Z-QFB*mf6Igcx~a5
z6-o00U-5@@oLT<njg7;`na0x(opZmkcUC?}U9*Pz0<J4BT|Qk|vu2K%wv>%^xm?7K
zf}IZ)x$J9xh|F~M5D^mDa_B?rwDoa&{UldqU0*zV*OQaGmTdfXW82xqQOxq&AL*A!
z+g(@vcU<wubmxg*u6i4V>h^O#S-iS5n{8X+@A~wOPp*86PdNYjP;JR;yXk(a9<Ajb
z-Ep={=e)l3QpJJm?yAOJ0sM{D%{)>~K^&{uPONOa^fBU;jkAB(i9k?G4^)<Ih`Mqz
zYwpRjx)v25Sk45#nU{F@&h>JOZwG8`*#2%-G?Vx}`Ns44w$>8cB)eMbf2Nv!yP9jE
z@U1>prh_NNLvlmy@5U1+b{sgcRN2kPk<EQZki@0(Ht`;||JKE60WK~&Ne%@feOo`x
zahX+~BG%LEJJVdB$5G?MR=4oGcmJ=Q|6%70i^6Z0mIfLuynHFozV6Qz#(R9dwGY;N
zIXf%A`Lnm0|M2pJPfva_v-KPf$^6ZGaDJd2^FD(HroceX9}Ov~PnoiAJamZs+&Q5%
zB;2<&^xKVB*)EI2vmdNzOTQ)G6I0-BC%N8x-lAyZh`txSCXD|x=I!FBmkebN=btEi
z{N(OwC&Yu6%vl{0@0%8qtQ-2wLsET-@#`hd6F>DG-KASxqItPwWusA(!Z#M?#1rB{
zPk%q35Iu3$s}5N3-SAqoY*|{)nQ7C*pFS6FlzQ^S#_90<jh(%<X-~v2Y+8}36%rjC
z9~v4Q6Vs=V&XaTd;o5a|LE+zp_HUfLPU+p^O`Dc&*s$iUiH7c4mTdw@1UEz!d~TfF
z^iTTJdC$0zkf^}G=KAOS9H;wOZvT$t6uq~!db#G~Bk^ySOqs72`uyVOq6r+@mY*MM
z-8Ty?wypiTinabh`IG!NcMmU$@D^~ms9(7)w?}SrPtxsexnQ(5((n7uQznaVT#K5&
zarbJj2PcI&`rdQrezx80P*C}!{Ns;fp!1Mfn#A8OHS&2cKKqaKsro-5%#LOA&5PRV
zCU4x~SoARkbigaqLGBlqj{PhQn7iej{6|ferZsl?Dg1v_imM*(C|>lwfPsO5gGK*O
zc+*zn<mo~iUtM2#=y<;*$GYW>|K6wc{<hAOk~#Ozn*ns_ScRp`UE#DZp3%=bpW4?w
zaVV%=mhgjrrnj_u`*d{%28IuxSef|pc$04*yS2Z~orfhqh5wgHvER=P^>HAV=$9%e
zteGkKJIp)#|EGe+gaf-HgFhYBJv^znU5J5!p+c02ae05+(#Of|hc=!HU-;+vcaRa^
ztF5!Y%%AysS8MQXVFu9ojB$n!AAB>|T7B!u{D_a6qnQtWEoHu+uCq#<PtOnH&Z)Wa
z6Z>*zh5gr?_Wwte!ijgk-TeN4-<&?(uiU@z5a>+l#s{T>VLqLEwOUTaFYUEI{qA$z
zwU(Gd=lV?0si2JywhH^nab2!n-rTj+dsch$YlzKYIS?TLKKrwAK$s27UwB!U>45Au
pIB~y9%9=}=fq|5BunRzk^s?}U912^?x^Owj98Xt2mvv4FO#qL)p+^7!

literal 0
HcmV?d00001

diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
index a84f1bbfa0..20f520814d 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
@@ -14,37 +14,50 @@
  limitations under the License.
 -->
 <RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
     android:layout_width="match_parent"
     android:layout_height="match_parent">
 
-    <com.example.android.tflitecamerademo.AutoFitTextureView
-        android:id="@+id/texture"
-        android:layout_width="wrap_content"
-        android:layout_height="wrap_content"
-        android:layout_alignParentBottom="true"
-        android:layout_alignParentStart="true"
-        android:layout_alignParentTop="true" />
-
-    <FrameLayout
-        android:id="@+id/control"
+    <LinearLayout
         android:layout_width="match_parent"
-        android:layout_height="wrap_content"
-        android:layout_alignParentBottom="true"
-        android:layout_alignParentEnd="true"
-        android:layout_alignParentTop="true"
-        android:layout_toRightOf="@id/texture"
-        android:background="@color/control_background"
-        android:orientation="horizontal">
-
-        <TextView android:id="@+id/text"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
+        android:layout_height="match_parent"
+        android:background="#bb7700"
+        android:orientation="horizontal"
+        android:weightSum="100">
+
+        <LinearLayout
+            android:layout_width="match_parent"
+            android:layout_height="match_parent"
+            android:layout_weight="30"
+            android:orientation="vertical">
+
+            <com.example.android.tflitecamerademo.AutoFitTextureView
+                android:id="@+id/texture"
+                android:layout_width="match_parent"
+                android:layout_height="match_parent"
+                android:layout_weight="100" />
+
+            <ImageView
+                android:id="@+id/logoview"
+                android:layout_width="match_parent"
+                android:layout_height="wrap_content"
+                android:layout_weight="100"
+                android:scaleType="centerCrop"
+                android:src="@drawable/logo" />
+
+        </LinearLayout>
+
+        <TextView
+            android:id="@+id/text"
+            android:layout_width="match_parent"
+            android:layout_height="match_parent"
+            android:layout_weight="70"
+            android:paddingLeft="5dp"
             android:paddingTop="20dp"
             android:textColor="#FFF"
             android:textSize="20sp"
             android:textStyle="bold" />
 
-
-    </FrameLayout>
+    </LinearLayout>
 
 </RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
new file mode 100644
index 0000000000..72a229ecdb
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
@@ -0,0 +1,88 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2014 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#bb7700">
+
+    <com.example.android.tflitecamerademo.AutoFitTextureView
+        android:id="@+id/texture"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:layout_weight="1" />
+
+    <LinearLayout
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:layout_alignParentBottom="true"
+        android:layout_alignParentEnd="false"
+        android:layout_alignParentStart="true"
+        android:layout_alignParentTop="false"
+        android:background="#bb7700"
+        android:orientation="vertical"
+        android:weightSum="100">
+
+        <ImageView
+            android:id="@+id/logoview2"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_weight="30"
+            android:scaleType="fitStart"
+            android:src="@drawable/logo" />
+
+        <TextView
+            android:id="@+id/text"
+            android:layout_width="match_parent"
+            android:layout_height="wrap_content"
+            android:layout_alignParentBottom="true"
+            android:layout_alignParentEnd="true"
+            android:layout_alignParentRight="true"
+            android:layout_weight="30"
+            android:textColor="#FFF"
+            android:textSize="20sp"
+            android:textStyle="bold" />
+
+    </LinearLayout>
+
+    <RelativeLayout
+        android:id="@+id/control2"
+        android:layout_width="match_parent"
+        android:layout_height="135dp"
+        android:layout_alignParentLeft="true"
+        android:layout_alignParentStart="true"
+        android:layout_alignTop="@+id/control"
+        android:layout_marginLeft="300dp"
+        android:layout_marginStart="300dp"
+        android:background="#bb7700">
+
+        <ToggleButton
+            android:id="@+id/button"
+            android:textOff="@string/tflite"
+            android:textOn="@string/nnapi"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_alignParentLeft="true"
+            android:layout_alignParentStart="true" />
+
+        <NumberPicker
+            android:id="@+id/np"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_below="@+id/button"
+            android:visibility="visible" />
+    </RelativeLayout>
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
index db557ad62f..2c4ce84473 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
@@ -14,9 +14,30 @@
  limitations under the License.
 -->
 <RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    xmlns:tools="http://schemas.android.com/tools"
     android:layout_width="match_parent"
     android:layout_height="match_parent">
 
+    <LinearLayout
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:orientation="vertical"
+        android:weightSum="60">
+
+        <FrameLayout
+            android:id="@+id/control"
+            android:layout_width="match_parent"
+            android:layout_height="match_parent"
+            android:layout_alignParentBottom="true"
+            android:layout_alignParentStart="true"
+            android:layout_weight="60"
+            android:background="#cc7700"
+            android:paddingLeft="20dp"
+            android:paddingStart="20dp">
+
+        </FrameLayout>
+
     <com.example.android.tflitecamerademo.AutoFitTextureView
         android:id="@+id/texture"
         android:layout_width="wrap_content"
@@ -25,29 +46,15 @@
         android:layout_alignParentLeft="true"
         android:layout_alignParentTop="true" />
 
-    <FrameLayout
-        android:id="@+id/control"
-        android:layout_width="match_parent"
-        android:layout_height="135dp"
-        android:layout_alignParentBottom="true"
-        android:layout_alignParentStart="true"
-        android:layout_alignParentLeft="true"
-        android:layout_alignParentEnd="true"
-        android:layout_alignParentRight="true"
-        android:layout_marginEnd="150dp"
-        android:layout_marginRight="150dp"
-        android:background="@color/control_background">
-
         <TextView
             android:id="@+id/text"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:paddingLeft="20dp"
+            android:layout_width="match_parent"
+            android:layout_height="match_parent"
+            android:layout_weight="20"
             android:textColor="#FFF"
             android:textSize="20sp"
             android:textStyle="bold" />
-
-    </FrameLayout>
+    </LinearLayout>
 
     <RelativeLayout
         android:id="@+id/control2"
@@ -58,7 +65,7 @@
         android:layout_alignTop="@+id/control"
         android:layout_marginLeft="300dp"
         android:layout_marginStart="300dp"
-        android:background="@color/control_background">
+        android:background="#bb7700">
 
         <ToggleButton
             android:id="@+id/button"
-- 
GitLab


From bf94bc682f1494aa912ef38628d82942bf4a1fc7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 10:36:13 -0700
Subject: [PATCH 412/450] Automated g4 rollback of changelist 194186080

PiperOrigin-RevId: 194261487
---
 tensorflow/core/grappler/op_types.cc          | 16 ++++++-----
 .../optimizers/arithmetic_optimizer.cc        | 27 +++++++++++++------
 .../optimizers/arithmetic_optimizer.h         |  5 +---
 3 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index f595cf6456..fba6c5810d 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -506,10 +506,13 @@ bool IsUnaryElementWise(const NodeDef& node) {
           "Atan2",
           "Atanh",
           "Ceil",
+          "CheckNumerics",
           "ComplexAbs",
           "Conj",
           "Cos",
           "Cosh",
+          "DebugGradientIdentity",
+          "DeepCopy",
           "Digamma",
           "Elu"
           "Erf",
@@ -517,36 +520,37 @@ bool IsUnaryElementWise(const NodeDef& node) {
           "Exp",
           "Expm1",
           "Floor",
+          "Identity",
           "Inv",
           "Invert",
-          "Isinf",
-          "Isnan",
-          "Isfinite",
           "Lgamma",
           "Log",
           "Log1p",
           "LogicalNot",
           "Neg",
+          "PreventGradient",
+          "Print",
           "Reciprocal",
           "Relu",
           "Relu6",
           "Rint",
           "Round",
-          "Selu",
           "Rsqrt",
+          "Selu",
           "Sigmoid",
           "Sign",
           "Sin",
           "SinH",
+          "Snapshot",
           "Softplus",
           "Softsign",
           "Sqrt",
           "Square",
+          "StopGradient",
           "Tan"
           "Tanh",
       }));
-  return element_wise_ops->count(node.op()) > 0 ||
-         (!IsIdentityN(node) && IsValueAndOrderPreserving(node));
+  return element_wise_ops->find(node.op()) != element_wise_ops->end();
 }
 
 bool HasOpDef(const NodeDef& node) {
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 866b993e93..65b3bad64d 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -303,6 +303,11 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
     }
   }
 
+  bool IsInPreserveSet(const NodeDef& node) const {
+    return ctx().nodes_to_preserve->find(node.name()) !=
+           ctx().nodes_to_preserve->end();
+  }
+
  private:
   // Extended context required for ArithmeticOptimizer.
   const ArithmeticOptimizerContext ctx_ext_;
@@ -475,11 +480,6 @@ class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
     return group.root_node->device() == node.device();
   }
 
-  bool IsInPreserveSet(const NodeDef& node) const {
-    return ctx().nodes_to_preserve->find(node.name()) !=
-           ctx().nodes_to_preserve->end();
-  }
-
   bool IsAlreadyOptimized(const NodeDef& node) const {
     return optimized_nodes_.find(node.name()) != optimized_nodes_.end();
   }
@@ -1346,6 +1346,7 @@ class RemoveNegationStage : public ArithmeticOptimizerStage {
 // Exp(Sin(Concat([x, y, z]))).
 // TODO(rmlarsen): Support casting. We would have to change the type attribute
 // on the concat node.
+// TODO(rmlarsen): Handle Enter/Exit.
 class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
  public:
   explicit HoistCWiseUnaryFromConcatStage(
@@ -1356,7 +1357,9 @@ class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
   ~HoistCWiseUnaryFromConcatStage() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
-    if (!IsConcat(*node)) return false;
+    if (!IsConcat(*node) || IsInPreserveSet(*node)) {
+      return false;
+    }
     const int n = node->attr().at("N").i();
     return n > 1;
   }
@@ -1368,6 +1371,11 @@ class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
     TF_RETURN_IF_ERROR(
         FindCommonUnaryOpPrefix(*concat_node, &prefix_length, &ctrl_inputs));
     if (prefix_length > 0) {
+      VLOG(2) << "Found prefix of length " << prefix_length << " for node:\n"
+              << concat_node->DebugString();
+      for (const string& ctrl_input : ctrl_inputs) {
+        VLOG(2) << "ctrl_input = " << ctrl_input;
+      }
       TF_RETURN_IF_ERROR(
           HoistUnaryOpPrefix(prefix_length, &ctrl_inputs, concat_node));
       AddToOptimizationQueue(concat_node);
@@ -1413,6 +1421,7 @@ class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
                                  std::set<string>* ctrl_inputs) const {
     *prefix_length = 0;
     const int n = concat_node.attr().at("N").i();
+    const string& concat_device = concat_node.device();
     // Follow the chains backwards from each concat input as long as all the
     // following conditions hold:
     //   1. The ops in all chains are the same.
@@ -1438,8 +1447,10 @@ class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
       if (!IsUnaryElementWise(*tail0)) break;
       for (int chain = 0; chain < n; ++chain) {
         // TODO(rmlarsen): Allow and hoist outgoing control edges.
-        if (tail[chain]->op() != tail0->op() ||
-            ctx().node_map->GetOutputs(tail[chain]->name()).size() > 1) {
+        if (tail[chain]->device() != concat_device ||
+            tail[chain]->op() != tail0->op() ||
+            ctx().node_map->GetOutputs(tail[chain]->name()).size() > 1 ||
+            IsInPreserveSet(*tail[chain])) {
           stop = true;
           break;
         }
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 375f13acc1..c299bd030b 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -65,16 +65,13 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
     bool remove_negation = true;
-    bool hoist_unary_out_of_concat = false;
+    bool hoist_unary_out_of_concat = true;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
     static ArithmeticOptimizerOptions Default(
         RewriterConfig::Toggle opt_level) {
       ArithmeticOptimizerOptions options;
-      if (opt_level == RewriterConfig::AGGRESSIVE) {
-        options.hoist_unary_out_of_concat = true;
-      }
       return options;
     }
   };
-- 
GitLab


From e767622c35766c3f9282574f63846c68a0b02248 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Wed, 25 Apr 2018 10:41:05 -0700
Subject: [PATCH 413/450] Add eager_service.proto

PiperOrigin-RevId: 194262260
---
 tensorflow/core/BUILD                        |  12 ++
 tensorflow/core/protobuf/eager_service.proto | 158 +++++++++++++++++++
 2 files changed, 170 insertions(+)
 create mode 100644 tensorflow/core/protobuf/eager_service.proto

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index e8f10f148d..c1cc861ef0 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1631,6 +1631,18 @@ tf_proto_library_cc(
     ],
 )
 
+tf_proto_library_cc(
+    name = "eager_service_proto",
+    srcs = ["protobuf/eager_service.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_stubby_versions = ["2"],
+    protodeps = tf_additional_all_protos(),
+    visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
 LIB_INTERNAL_PRIVATE_HEADERS = ["framework/resource_handle.h"] + glob(
     [
         "lib/**/*.h",
diff --git a/tensorflow/core/protobuf/eager_service.proto b/tensorflow/core/protobuf/eager_service.proto
new file mode 100644
index 0000000000..c2325cc803
--- /dev/null
+++ b/tensorflow/core/protobuf/eager_service.proto
@@ -0,0 +1,158 @@
+syntax = "proto3";
+
+package tensorflow.eager;
+
+import "tensorflow/core/framework/attr_value.proto";
+import "tensorflow/core/framework/device_attributes.proto";
+import "tensorflow/core/framework/function.proto";
+import "tensorflow/core/framework/versions.proto";
+import "tensorflow/core/protobuf/tensorflow_server.proto";
+
+message RemoteTensorHandle {
+  // The ID of the operation that produced this tensor.
+  int64 op_id = 1;
+  // The index into the outputs of the operation that produced this tensor.
+  int32 output_num = 2;
+}
+
+// A proto representation of an eager operation.
+message Operation {
+  // A unique identifier for the operation. Set by the client so that the client
+  // can uniquely identify the outputs of the scheduled operation.
+  //
+  // In the initial implementation, sending duplicate IDs has undefined
+  // behaviour, but additional constraints may be placed upon this in the
+  // future.
+  int64 id = 1;
+  string name = 2;
+  repeated RemoteTensorHandle inputs = 3;
+
+  // Control Operation IDs that will be respected when ops are re-ordered by
+  // async execution. If async execution (+ op re-ordering) is not enabled, this
+  // should have no effect.
+  repeated int64 control_op_ids = 4;
+  map<string, AttrValue> attrs = 5;
+  string device = 6;
+}
+
+message QueueItem {
+  // The remote executor should be able to handle either executing ops directly,
+  // or releasing any unused tensor handles, since the tensor lifetime is
+  // maintained by the client.
+  oneof item {
+    RemoteTensorHandle handle_to_decref = 1;
+    Operation operation = 2;
+  }
+}
+
+message CreateContextRequest {
+  // Identifies the full cluster, and this particular worker's position within.
+  ServerDef server_def = 1;
+
+  // Whether the ops on the worker should be executed synchronously or
+  // asynchronously. By default, ops are executed synchronously.
+  bool async = 2;
+
+  // Number of seconds to keep the context alive. If more than keep_alive_secs
+  // has passed since a particular context has been communicated with, it will
+  // be garbage collected.
+  int64 keep_alive_secs = 3;
+
+  // This is the version for all the ops that will be enqueued by the client.
+  VersionDef version_def = 4;
+}
+
+message CreateContextResponse {
+  // The ID of the created context. This is usually a randomly generated number,
+  // that will be used to identify the context in future requests to the
+  // service. Contexts are not persisted through server restarts.
+  fixed64 context_id = 1;
+
+  // List of devices that are locally accessible to the worker.
+  repeated DeviceAttributes device_attributes = 2;
+}
+
+message EnqueueRequest {
+  fixed64 context_id = 1;
+
+  repeated QueueItem queue = 3;
+}
+
+message EnqueueResponse {
+}
+
+message WaitQueueDoneRequest {
+  fixed64 context_id = 1;
+
+  // Ids to wait on. If empty, wait on everything currently pending.
+  repeated int64 op_id = 2;
+}
+
+message WaitQueueDoneResponse {
+  // TODO(nareshmodi): Consider adding NodeExecStats here to be able to
+  // propagate some stats.
+}
+
+message KeepAliveRequest {
+  fixed64 context_id = 1;
+}
+
+message KeepAliveResponse {
+}
+
+message CloseContextRequest {
+  fixed64 context_id = 1;
+}
+
+message CloseContextResponse {
+}
+
+message RegisterFunctionRequest {
+  fixed64 context_id = 1;
+
+  FunctionDef function_def = 2;
+}
+
+message RegisterFunctionResponse {
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Eager Service defines a TensorFlow service that executes operations eagerly
+// on a set of local devices, on behalf of a remote Eager executor.
+//
+// The service impl will keep track of the various peers and devices it has
+// access to and allows the client to enqueue ops on any devices that it is able
+// to access and schedule data transfers from/to any of the peers.
+//
+////////////////////////////////////////////////////////////////////////////////
+service EagerService {
+  // This initializes the worker, informing it about the other workers in the
+  // cluster and exchanging authentication tokens which will be used in all
+  // other RPCs to detect whether the worker has restarted.
+  rpc CreateContext(CreateContextRequest) returns (CreateContextResponse);
+
+  // This takes a list of Execute and DeleteTensorHandle operations and enqueues
+  // (in async mode) or executes (in sync mode) them on the remote server.
+  // All outputs of ops which were not explicitly deleted with
+  // DeleteTensorHandle entries will be assumed to be alive and are usable by
+  // future calls to Enqueue.
+  rpc Enqueue(EnqueueRequest) returns (EnqueueResponse);
+
+  // Takes a set of op IDs and waits until those ops are done. Returns any error
+  // in the stream so far.
+  rpc WaitQueueDone(WaitQueueDoneRequest) returns (WaitQueueDoneResponse);
+
+  // Contexts are always created with a deadline and no RPCs within a deadline
+  // will trigger a context garbage collection. KeepAlive calls can be used to
+  // delay this.
+  rpc KeepAlive(KeepAliveRequest) returns (KeepAliveResponse);
+
+  // Closes the context. No calls to other methods using the existing context ID
+  // are valid after this.
+  rpc CloseContext(CloseContextRequest) returns (CloseContextResponse);
+
+  // Takes a FunctionDef and makes it enqueueable on the remote worker.
+  rpc RegisterFunction(RegisterFunctionRequest)
+      returns (RegisterFunctionResponse);
+}
-- 
GitLab


From 315ea39039a080540ac5570bb4069abe313291ff Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Wed, 25 Apr 2018 10:49:30 -0700
Subject: [PATCH 414/450] Adding the h5py dependency in devel docker files.

---
 tensorflow/tools/docker/Dockerfile.devel     | 2 ++
 tensorflow/tools/docker/Dockerfile.devel-gpu | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 5c49ac1d8d..ba8bd4f3f5 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -28,6 +28,8 @@ RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
     rm get-pip.py
 
 RUN pip --no-cache-dir install \
+        Pillow \
+        h5py \
         ipykernel \
         jupyter \
         matplotlib \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 196227861b..e747deb61c 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -37,6 +37,8 @@ RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
     rm get-pip.py
 
 RUN pip --no-cache-dir install \
+        Pillow \
+        h5py \
         ipykernel \
         jupyter \
         matplotlib \
-- 
GitLab


From bae4221c6cede224074c842f14fd5985ac18e2e2 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Wed, 25 Apr 2018 10:57:06 -0700
Subject: [PATCH 415/450] Adding install for libhdf5-serial-dev.

---
 tensorflow/tools/docker/Dockerfile           | 1 +
 tensorflow/tools/docker/Dockerfile.devel     | 1 +
 tensorflow/tools/docker/Dockerfile.devel-gpu | 1 +
 tensorflow/tools/docker/Dockerfile.gpu       | 1 +
 4 files changed, 4 insertions(+)

diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 78cb4d250e..a3ff8211e3 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -7,6 +7,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
         curl \
         libfreetype6-dev \
+        libhdf5-serial-dev \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index ba8bd4f3f5..b9996395d0 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -8,6 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         git \
         libcurl3-dev \
         libfreetype6-dev \
+        libhdf5-serial-dev \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index e747deb61c..7e5e6ef2d5 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -17,6 +17,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcudnn7-dev=7.0.5.15-1+cuda9.0 \
         libcurl3-dev \
         libfreetype6-dev \
+        libhdf5-serial-dev \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 9e1708662e..bff4a20392 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -14,6 +14,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         curl \
         libcudnn7=7.0.5.15-1+cuda9.0 \
         libfreetype6-dev \
+        libhdf5-serial-dev \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
-- 
GitLab


From c49f77fe54e728b62c75f7fbe151abe7565d3e5c Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Wed, 25 Apr 2018 10:58:13 -0700
Subject: [PATCH 416/450] Delete from_stream_executor_status.h.

It's dead code.

PiperOrigin-RevId: 194265380
---
 .../default/from_stream_executor_status.h     | 32 -------------------
 tensorflow/core/platform/stream_executor.h    |  2 --
 .../core/platform/stream_executor_no_cuda.h   |  2 --
 3 files changed, 36 deletions(-)
 delete mode 100644 tensorflow/core/platform/default/from_stream_executor_status.h

diff --git a/tensorflow/core/platform/default/from_stream_executor_status.h b/tensorflow/core/platform/default/from_stream_executor_status.h
deleted file mode 100644
index 93d2f28ca8..0000000000
--- a/tensorflow/core/platform/default/from_stream_executor_status.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_PLATFORM_DEFAULT_FROM_STREAM_EXECUTOR_STATUS_H_
-#define TENSORFLOW_PLATFORM_DEFAULT_FROM_STREAM_EXECUTOR_STATUS_H_
-
-// IWYU pragma: private, include "third_party/tensorflow/core/platform/from_stream_executor_status.h"
-// IWYU pragma: friend third_party/tensorflow/core/platform/from_stream_executor_status.h
-
-#include "tensorflow/stream_executor/lib/status.h"
-
-namespace tensorflow {
-
-// On the open-source platform, stream_executor currently uses
-// tensorflow::Status
-inline Status FromStreamExecutorStatus(const se::port::Status& s) { return s; }
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_PLATFORM_DEFAULT_FROM_STREAM_EXECUTOR_STATUS_H_
diff --git a/tensorflow/core/platform/stream_executor.h b/tensorflow/core/platform/stream_executor.h
index 006184ddef..0a590b3d40 100644
--- a/tensorflow/core/platform/stream_executor.h
+++ b/tensorflow/core/platform/stream_executor.h
@@ -19,10 +19,8 @@ limitations under the License.
 #include "tensorflow/core/platform/platform.h"
 
 #if defined(PLATFORM_GOOGLE)
-#include "tensorflow/core/platform/google/from_stream_executor_status.h"
 #include "tensorflow/stream_executor/platform/google/dso_loader.h"
 #else
-#include "tensorflow/core/platform/default/from_stream_executor_status.h"
 #include "tensorflow/stream_executor/dso_loader.h"
 #endif
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
diff --git a/tensorflow/core/platform/stream_executor_no_cuda.h b/tensorflow/core/platform/stream_executor_no_cuda.h
index 4a41d7adf5..50a5e732c0 100644
--- a/tensorflow/core/platform/stream_executor_no_cuda.h
+++ b/tensorflow/core/platform/stream_executor_no_cuda.h
@@ -19,10 +19,8 @@ limitations under the License.
 #include "tensorflow/core/platform/platform.h"
 
 #if defined(PLATFORM_GOOGLE)
-#include "tensorflow/core/platform/google/from_stream_executor_status.h"
 #include "tensorflow/stream_executor/platform/google/dso_loader.h"
 #else
-#include "tensorflow/core/platform/default/from_stream_executor_status.h"
 #include "tensorflow/stream_executor/dso_loader.h"
 #endif
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
-- 
GitLab


From 1ec0ffba6be70de13eebad4c0e6b94856fae2d7d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 11:14:49 -0700
Subject: [PATCH 417/450] Enabled TENSORFLOW_USE_ABSL for mobile.
 TENSORFLOW_USE_ABSL is now enabled in all cases.

This lets TENSORFLOW_USE_ABSL be removed (as it will always be true), and is necessary to migrate TensorFlow to absl.

PiperOrigin-RevId: 194268101
---
 tensorflow/core/platform/default/build_config.bzl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index ca0587e277..ec3bbab1fd 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -575,11 +575,11 @@ def tf_additional_lib_defines():
       "//tensorflow:with_jemalloc_linux_x86_64": ["TENSORFLOW_USE_JEMALLOC"],
       "//tensorflow:with_jemalloc_linux_ppc64le":["TENSORFLOW_USE_JEMALLOC"],
       "//conditions:default": [],
-  }) + if_not_mobile(["TENSORFLOW_USE_ABSL"])
+  }) + ["TENSORFLOW_USE_ABSL"]
 
 def tf_additional_lib_deps():
   """Additional dependencies needed to build TF libraries."""
-  return if_not_mobile(["@com_google_absl//absl/base:base"]) + if_static(
+  return ["@com_google_absl//absl/base:base"] + if_static(
       ["@nsync//:nsync_cpp"],
       ["@nsync//:nsync_headers"]
   ) + select({
-- 
GitLab


From aa2c22c7fb47b4f042e2e7f75460d2b8bd9db961 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 11:22:37 -0700
Subject: [PATCH 418/450] [TF:XLA] Fix entry computation layout assignment in
 the HLO parser

For tuple shapes, the layouts were not correctly attached to the module.

PiperOrigin-RevId: 194269675
---
 tensorflow/compiler/xla/shape_layout.h           |  3 +--
 .../compiler/xla/tools/parser/hlo_parser.cc      | 16 ++++++----------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/tensorflow/compiler/xla/shape_layout.h b/tensorflow/compiler/xla/shape_layout.h
index 4c83750f3e..a1dce758cd 100644
--- a/tensorflow/compiler/xla/shape_layout.h
+++ b/tensorflow/compiler/xla/shape_layout.h
@@ -48,8 +48,7 @@ class ShapeLayout {
   bool MatchesLayoutInShape(const Shape& shape) const;
 
   // Copies the layout from the given shape into this ShapeLayout. 'other_shape'
-  // must be compatible with the ShapeLayout's shape, and 'other_shape' must
-  // have a layout (LayoutUtil::HasLayout).
+  // must be compatible with the ShapeLayout's shape.
   tensorflow::Status CopyLayoutFromShape(const Shape& other_shape);
 
   // Clears (Layout::Clear) all the Layouts stored in this object.
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index 95d3fd28b3..fdbfc0210e 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -303,18 +303,14 @@ bool HloParser::ParseComputations() {
     // set the layouts to what the hlo text says.
     for (int p = 0; p < computation->num_parameters(); p++) {
       const Shape& param_shape = computation->parameter_instruction(p)->shape();
-      if (param_shape.has_layout()) {
-        module_->mutable_entry_computation_layout()
-            ->mutable_parameter_layout(p)
-            ->ResetLayout(param_shape.layout());
-      }
+      TF_CHECK_OK(module_->mutable_entry_computation_layout()
+                      ->mutable_parameter_layout(p)
+                      ->CopyLayoutFromShape(param_shape));
     }
     const Shape& result_shape = computation->root_instruction()->shape();
-    if (result_shape.has_layout()) {
-      module_->mutable_entry_computation_layout()
-          ->mutable_result_layout()
-          ->ResetLayout(result_shape.layout());
-    }
+    TF_CHECK_OK(module_->mutable_entry_computation_layout()
+                    ->mutable_result_layout()
+                    ->CopyLayoutFromShape(result_shape));
   }
 
   return true;
-- 
GitLab


From e959f7d25e5218b54172d1590fcd3d1d23d7eaf3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 11:25:10 -0700
Subject: [PATCH 419/450] Serialize strings properly when using TOCO for model
 conversion.

PiperOrigin-RevId: 194270132
---
 tensorflow/contrib/lite/string_util.cc        | 45 +++++++++++++------
 tensorflow/contrib/lite/string_util.h         |  8 +++-
 tensorflow/contrib/lite/toco/tflite/BUILD     |  1 +
 tensorflow/contrib/lite/toco/tflite/types.cc  | 33 +++++++++++++-
 .../contrib/lite/toco/tflite/types_test.cc    |  7 +++
 5 files changed, 77 insertions(+), 17 deletions(-)

diff --git a/tensorflow/contrib/lite/string_util.cc b/tensorflow/contrib/lite/string_util.cc
index cd41299d38..a89776b29f 100644
--- a/tensorflow/contrib/lite/string_util.cc
+++ b/tensorflow/contrib/lite/string_util.cc
@@ -24,7 +24,10 @@ namespace tflite {
 namespace {
 
 // Convenient method to get pointer to int32_t.
-int32_t* GetIntPtr(char* ptr) { return reinterpret_cast<int32_t*>(ptr); }
+const int32_t* GetIntPtr(const char* ptr) {
+  return reinterpret_cast<const int32_t*>(ptr);
+}
+
 }  // namespace
 
 void DynamicBuffer::AddString(const char* str, size_t len) {
@@ -64,7 +67,7 @@ void DynamicBuffer::AddJoinedString(const std::vector<StringRef>& strings,
   offset_.push_back(offset_.back() + total_len);
 }
 
-void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) {
+int DynamicBuffer::WriteToBuffer(char** buffer) {
   // Allocate sufficient memory to tensor buffer.
   int32_t num_strings = offset_.size() - 1;
   // Total bytes include:
@@ -75,43 +78,57 @@ void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) {
   int32_t bytes = data_.size()                            // size of content
                   + sizeof(int32_t) * (num_strings + 2);  // size of header
 
-  // Output tensor will take over the ownership of tensor_buffer, and free it
-  // during Interpreter destruction.
-  char* tensor_buffer = static_cast<char*>(malloc(bytes));
+  // Caller will take ownership of buffer.
+  *buffer = reinterpret_cast<char*>(malloc(bytes));
 
   // Set num of string
-  memcpy(tensor_buffer, &num_strings, sizeof(int32_t));
+  memcpy(*buffer, &num_strings, sizeof(int32_t));
 
   // Set offset of strings.
   int32_t start = sizeof(int32_t) * (num_strings + 2);
   for (int i = 0; i < offset_.size(); i++) {
     int32_t offset = start + offset_[i];
-    memcpy(tensor_buffer + sizeof(int32_t) * (i + 1), &offset, sizeof(int32_t));
+    memcpy(*buffer + sizeof(int32_t) * (i + 1), &offset, sizeof(int32_t));
   }
 
   // Copy data of strings.
-  memcpy(tensor_buffer + start, data_.data(), data_.size());
+  memcpy(*buffer + start, data_.data(), data_.size());
+  return bytes;
+}
+
+void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) {
+  char* tensor_buffer;
+  int bytes = WriteToBuffer(&tensor_buffer);
 
   // Set tensor content pointer to tensor_buffer, and release original data.
   auto dims = TfLiteIntArrayCreate(1);
-  dims->data[0] = num_strings;
+  dims->data[0] = offset_.size() - 1;  // Store number of strings.
   TfLiteTensorReset(tensor->type, tensor->name, dims, tensor->params,
                     tensor_buffer, bytes, kTfLiteDynamic, tensor->allocation,
                     tensor);
 }
 
+int GetStringCount(const char* raw_buffer) {
+  // The first integer in the raw buffer is the number of strings.
+  return *GetIntPtr(raw_buffer);
+}
+
 int GetStringCount(const TfLiteTensor* tensor) {
   // The first integers in the raw buffer is the number of strings.
-  return *GetIntPtr(tensor->data.raw);
+  return GetStringCount(tensor->data.raw);
 }
 
-StringRef GetString(const TfLiteTensor* tensor, int string_index) {
-  int32_t* offset =
-      GetIntPtr(tensor->data.raw + sizeof(int32_t) * (string_index + 1));
+StringRef GetString(const char* raw_buffer, int string_index) {
+  const int32_t* offset =
+      GetIntPtr(raw_buffer + sizeof(int32_t) * (string_index + 1));
   return {
-      tensor->data.raw + (*offset),
+      raw_buffer + (*offset),
       (*(offset + 1)) - (*offset),
   };
 }
 
+StringRef GetString(const TfLiteTensor* tensor, int string_index) {
+  return GetString(tensor->data.raw, string_index);
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/string_util.h b/tensorflow/contrib/lite/string_util.h
index c35a2fff3c..57f129bf5e 100644
--- a/tensorflow/contrib/lite/string_util.h
+++ b/tensorflow/contrib/lite/string_util.h
@@ -49,7 +49,7 @@ namespace tflite {
 
 // Convenient structure to store string pointer and length.
 typedef struct {
-  char* str;
+  const char* str;
   int len;
 } StringRef;
 
@@ -70,6 +70,10 @@ class DynamicBuffer {
   // buffer.
   void AddJoinedString(const std::vector<StringRef>& strings, char separator);
 
+  // Fills content into a newly malloc()'ed buffer and returns its size in
+  // bytes. Ownership of *buffer passes to the caller, who must free() it.
+  int WriteToBuffer(char** buffer);
+
   // Fill content into a string tensor.
   void WriteToTensor(TfLiteTensor* tensor);
 
@@ -81,10 +85,12 @@ class DynamicBuffer {
 };
 
 // Return num of strings in a String tensor.
+int GetStringCount(const char* raw_buffer);
 int GetStringCount(const TfLiteTensor* tensor);
 
 // Get String pointer and length of index-th string in tensor.
 // NOTE: This will not create a copy of string data.
+StringRef GetString(const char* raw_buffer, int string_index);
 StringRef GetString(const TfLiteTensor* tensor, int string_index);
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD
index e0191801a0..e1025c6664 100644
--- a/tensorflow/contrib/lite/toco/tflite/BUILD
+++ b/tensorflow/contrib/lite/toco/tflite/BUILD
@@ -54,6 +54,7 @@ cc_library(
         "types.h",
     ],
     deps = [
+        "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/toco:model",
     ],
diff --git a/tensorflow/contrib/lite/toco/tflite/types.cc b/tensorflow/contrib/lite/toco/tflite/types.cc
index 0afd2f3df5..c9c2e9ba01 100644
--- a/tensorflow/contrib/lite/toco/tflite/types.cc
+++ b/tensorflow/contrib/lite/toco/tflite/types.cc
@@ -13,12 +13,29 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/toco/tflite/types.h"
+#include "tensorflow/contrib/lite/string_util.h"
 
 namespace toco {
 
 namespace tflite {
 
 namespace {
+
+// Packs the array's strings via DynamicBuffer into a flatbuffer byte vector.
+DataBuffer::FlatBufferOffset CopyStringToBuffer(
+    const Array& array, flatbuffers::FlatBufferBuilder* builder) {
+  ::tflite::DynamicBuffer packer;
+  for (const string& s : array.GetBuffer<ArrayDataType::kString>().data) {
+    packer.AddString(s.c_str(), s.length());
+  }
+  char* packed = nullptr;  // Allocated by WriteToBuffer; released below.
+  const int num_bytes = packer.WriteToBuffer(&packed);
+  const auto offset = builder->CreateVector(
+      reinterpret_cast<const uint8_t*>(packed), num_bytes);
+  free(packed);
+  return offset;
+}
+
 template <ArrayDataType T>
 DataBuffer::FlatBufferOffset CopyBuffer(
     const Array& array, flatbuffers::FlatBufferBuilder* builder) {
@@ -29,6 +46,18 @@ DataBuffer::FlatBufferOffset CopyBuffer(
   return builder->CreateVector(dst_data, size);
 }
 
+// Reads every string stored in `buffer` (via GetStringCount/GetString) and
+// appends a copy of each to the kString data of `array`.
+void CopyStringFromBuffer(const ::tflite::Buffer& buffer, Array* array) {
+  const char* raw = reinterpret_cast<const char*>(buffer.data()->data());
+  auto& out = array->GetMutableBuffer<ArrayDataType::kString>().data;
+  const int32_t count = ::tflite::GetStringCount(raw);
+  for (int idx = 0; idx < count; ++idx) {
+    const ::tflite::StringRef ref = ::tflite::GetString(raw, idx);
+    out.push_back(string(ref.str, ref.len));
+  }
+}
+
 template <ArrayDataType T>
 void CopyBuffer(const ::tflite::Buffer& buffer, Array* array) {
   using NativeT = ::toco::DataType<T>;
@@ -93,7 +122,7 @@ flatbuffers::Offset<flatbuffers::Vector<uint8_t>> DataBuffer::Serialize(
     case ArrayDataType::kInt64:
       return CopyBuffer<ArrayDataType::kInt64>(array, builder);
     case ArrayDataType::kString:
-      return CopyBuffer<ArrayDataType::kString>(array, builder);
+      return CopyStringToBuffer(array, builder);
     case ArrayDataType::kUint8:
       return CopyBuffer<ArrayDataType::kUint8>(array, builder);
     default:
@@ -114,7 +143,7 @@ void DataBuffer::Deserialize(const ::tflite::Tensor& tensor,
     case ::tflite::TensorType_INT64:
       return CopyBuffer<ArrayDataType::kInt64>(buffer, array);
     case ::tflite::TensorType_STRING:
-      return CopyBuffer<ArrayDataType::kString>(buffer, array);
+      return CopyStringFromBuffer(buffer, array);
     case ::tflite::TensorType_UINT8:
       return CopyBuffer<ArrayDataType::kUint8>(buffer, array);
     default:
diff --git a/tensorflow/contrib/lite/toco/tflite/types_test.cc b/tensorflow/contrib/lite/toco/tflite/types_test.cc
index a040fe1358..29fb0b2af2 100644
--- a/tensorflow/contrib/lite/toco/tflite/types_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/types_test.cc
@@ -151,6 +151,13 @@ TEST(DataBuffer, Int32) {
               ::testing::ElementsAre(1, 1 << 30));
 }
 
+TEST(DataBuffer, String) {  // Strings of varying length, in order.
+  Array recovered = ToFlatBufferAndBack<ArrayDataType::kString>(
+      {"AA", "BBB", "Best. String. Ever."});
+  EXPECT_THAT(recovered.GetBuffer<ArrayDataType::kString>().data,
+              ::testing::ElementsAre("AA", "BBB", "Best. String. Ever."));
+}
+
 TEST(Padding, All) {
   EXPECT_EQ(::tflite::Padding_SAME, Padding::Serialize(PaddingType::kSame));
   EXPECT_EQ(PaddingType::kSame, Padding::Deserialize(::tflite::Padding_SAME));
-- 
GitLab


From 12bf1e33e7138101e99eedf3f609f70df2fef160 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Wed, 25 Apr 2018 11:47:01 -0700
Subject: [PATCH 420/450] Set up a basic README with a pointer to the dev
 summit colab.

PiperOrigin-RevId: 194273919
---
 tensorflow/contrib/autograph/README.md | 119 ++++++++++++++++++++++++-
 1 file changed, 116 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/autograph/README.md b/tensorflow/contrib/autograph/README.md
index 7e84f237dc..0fcbf5dd59 100644
--- a/tensorflow/contrib/autograph/README.md
+++ b/tensorflow/contrib/autograph/README.md
@@ -1,4 +1,117 @@
-# Autograph
+# AutoGraph
 
-A compiler for generating TensorFlow numeric and control flow ops from Python
-code.
+IMPORTANT: AutoGraph is pre-alpha, under active development. Expect rough edges and bugs, but if you try it, we appreciate early feedback!
+
+AutoGraph is a Python to TensorFlow compiler.
+
+With AutoGraph, you can write [Eager style](https://www.tensorflow.org/programmers_guide/eager) code in a concise manner, and run it as a TensorFlow graph. AutoGraph uses source code transformation and partial evaluation to generate Python code that builds an equivalent TensorFlow subgraph. The result is code that behaves like ops and can be freely combined with other TensorFlow ops.
+
+For example, this Python function:
+
+```
+def f(x):
+  if x < 0:
+    x = -x
+  return x
+```
+
+would be converted to this:
+
+```
+def graph_mode_f(x):
+  with tf.name_scope('f'):
+
+    def if_true():
+      with tf.name_scope('if_true'):
+        x_1, = x,
+        x_1 = tf.negative(x_1)
+        return x_1,
+
+    def if_false():
+      with tf.name_scope('if_false'):
+        x_1, = x,
+        return x_1,
+    x = ag__.utils.run_cond(tf.less(x, 0), if_true, if_false)
+    return x
+```
+
+so you can use it like an op:
+
+```
+with tf.Graph().as_default():
+  x = tf.constant(-1.0)
+
+  converted_f = autograph.to_graph(f)
+  y = converted_f(x)
+
+  with tf.Session() as sess:
+    print(sess.run(y))
+    # Output: 1.0
+```
+
+# Getting started
+
+Use AutoGraph in one of the following ways, described below:
+
+ 1. Annotations (simpler)
+ 2. Functional API (more flexible)
+
+NOTE: You can find more examples in this [interactive notebook](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb).
+
+To get started, install the latest nightly TensorFlow build:
+
+```shell
+pip install -U tf-nightly
+```
+
+Then import the `autograph` module from `tf.contrib`:
+
+```
+from tensorflow.contrib import autograph as ag
+```
+
+## Using with annotations
+
+Annotating a function or class with `@convert` converts it in place:
+
+```
+@ag.convert()
+def f(x):
+  if x < 0:
+    x = -x
+  return x
+```
+
+... so that it always outputs TensorFlow code:
+
+```
+with tf.Graph().as_default():
+  x = tf.constant(-1)
+
+  y = f(x)
+
+  with tf.Session() as sess:
+    print(sess.run(y))
+    # Output: 1
+```
+
+## Using the functional API
+
+The functional API allows you to convert an existing function, class or object after it was defined:
+
+```
+converted_f = ag.to_graph(f)
+
+print(converted_f(tf.constant(-1)))
+# Output: Tensor
+
+print(f(-1))
+# Output: 1
+```
+
+You can use the functional API to inspect the generated code as well:
+
+```
+print(ag.to_code(f))
+# Output: <Python and TensorFlow code>
+```
-- 
GitLab


From 3563165fb8328fd0fd6eba88557c25209453339f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 11:48:34 -0700
Subject: [PATCH 421/450] Disable nasnet_test on MSAN due to flaky timeout.

PiperOrigin-RevId: 194274190
---
 tensorflow/python/keras/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 1c58553156..a14a121b6e 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -382,6 +382,7 @@ py_test(
     size = "large",
     srcs = ["_impl/keras/applications/nasnet_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["nomsan"],  # times out, http://b/78573625
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
-- 
GitLab


From fe965a8502c1a6667ab209dfbfd8b84a6bfb45ee Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 25 Apr 2018 11:52:08 -0700
Subject: [PATCH 422/450] Removing remove_undocumented calls from
 tensorflow/python.

PiperOrigin-RevId: 194274698
---
 tensorflow/contrib/framework/__init__.py      |  18 +-
 .../meta_graph_transform.py                   |   5 +-
 tensorflow/python/__init__.py                 | 166 --------------
 tensorflow/python/data/__init__.py            |   3 -
 tensorflow/python/data/util/nest.py           |  14 --
 tensorflow/python/estimator/estimator_lib.py  |  41 ----
 .../python/estimator/export/export_lib.py     |  13 --
 tensorflow/python/estimator/inputs/inputs.py  |   8 -
 .../feature_column/feature_column_lib.py      |  21 --
 tensorflow/python/framework/errors.py         |  46 ----
 tensorflow/python/framework/graph_util.py     |  11 -
 tensorflow/python/layers/layers.py            |   6 -
 tensorflow/python/lib/io/python_io.py         |   5 -
 tensorflow/python/ops/bitwise_ops.py          |   3 -
 .../python/ops/distributions/bijector.py      |   5 -
 .../python/ops/distributions/distributions.py |  26 ---
 .../distributions/transformed_distribution.py |   2 -
 tensorflow/python/ops/gradients.py            |  10 -
 tensorflow/python/ops/image_ops.py            |  10 -
 tensorflow/python/ops/losses/losses.py        |   9 -
 tensorflow/python/ops/manip_ops.py            |   5 -
 tensorflow/python/ops/metrics.py              |   5 -
 tensorflow/python/ops/nn.py                   |  20 --
 tensorflow/python/ops/rnn_cell.py             |   5 -
 tensorflow/python/ops/sdca_ops.py             |   5 -
 tensorflow/python/ops/sets.py                 |   5 -
 tensorflow/python/ops/spectral_ops.py         |   3 -
 tensorflow/python/ops/standard_ops.py         | 210 ------------------
 tensorflow/python/platform/app.py             |   9 -
 tensorflow/python/platform/gfile.py           |  22 --
 tensorflow/python/platform/resource_loader.py |   5 -
 tensorflow/python/platform/sysconfig.py       |   4 -
 tensorflow/python/platform/test.py            |  11 -
 tensorflow/python/platform/tf_logging.py      |  28 ---
 tensorflow/python/profiler/profiler.py        |   9 -
 tensorflow/python/saved_model/builder.py      |   7 -
 tensorflow/python/saved_model/constants.py    |  15 --
 tensorflow/python/saved_model/loader.py       |   8 -
 tensorflow/python/saved_model/main_op.py      |   7 -
 tensorflow/python/saved_model/saved_model.py  |  15 --
 .../python/saved_model/signature_constants.py |  17 --
 .../python/saved_model/tag_constants.py       |   9 -
 tensorflow/python/saved_model/utils.py        |   4 -
 tensorflow/python/summary/summary.py          |   8 -
 tensorflow/python/training/queue_runner.py    |  10 -
 tensorflow/python/training/training.py        |  46 ----
 tensorflow/python/util/compat.py              |  11 -
 tensorflow/python/util/nest.py                |  19 --
 48 files changed, 21 insertions(+), 923 deletions(-)

diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 11397e86bd..10d1ecc738 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -108,6 +108,7 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.framework.python.framework import *
+from tensorflow.contrib.framework.python.framework import nest
 from tensorflow.contrib.framework.python.ops import *
 # pylint: enable=unused-import,wildcard-import
 
@@ -126,5 +127,20 @@ from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = ['nest', 'broadcast_to']
-
+_nest_allowed_symbols = [
+    'assert_same_structure',
+    'is_sequence',
+    'flatten',
+    'flatten_dict_items',
+    'pack_sequence_as',
+    'map_structure',
+    'assert_shallow_structure',
+    'flatten_up_to',
+    'map_structure_up_to',
+    'get_traverse_shallow_structure',
+    'yield_flat_paths',
+    'flatten_with_joined_string_paths',
+]
+
+remove_undocumented(nest.__name__, allowed_exception_list=_nest_allowed_symbols)
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
index f37a2593e2..c35e60a554 100644
--- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
+++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
@@ -13,7 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Apply graph_transforms tool to MetaGraphDefs."""
+"""Apply graph_transforms tool to MetaGraphDefs.
+
+@@meta_graph_transform
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index c1702ae13c..cf707fb2c7 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -120,31 +120,9 @@ from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import sysconfig
 from tensorflow.python.platform import test
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.all_util import make_all
 from tensorflow.python.util.tf_export import tf_export
 
-# Import modules whose docstrings contribute, for use by remove_undocumented
-# below.
-from tensorflow.python.client import client_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import framework_lib
-from tensorflow.python.framework import subscribe
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import confusion_matrix as confusion_matrix_m
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import histogram_ops
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import script_ops
-from tensorflow.python.ops import session_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops import tensor_array_ops
-
 # Eager execution
 from tensorflow.python.eager.context import executing_eagerly
 from tensorflow.python.framework.ops import enable_eager_execution
@@ -163,35 +141,6 @@ nn.bidirectional_dynamic_rnn = rnn.bidirectional_dynamic_rnn
 nn.static_state_saving_rnn = rnn.static_state_saving_rnn
 nn.rnn_cell = rnn_cell
 
-# Symbols whitelisted for export without documentation.
-# TODO(cwhipkey): review these and move to contrib, expose through
-# documentation, or remove.
-_allowed_symbols = [
-    'AttrValue',
-    'ConfigProto',
-    'ClusterDef',
-    'DeviceSpec',
-    'Event',
-    'GPUOptions',
-    'GRAPH_DEF_VERSION',
-    'GRAPH_DEF_VERSION_MIN_CONSUMER',
-    'GRAPH_DEF_VERSION_MIN_PRODUCER',
-    'GraphDef',
-    'GraphOptions',
-    'HistogramProto',
-    'LogMessage',
-    'MetaGraphDef',
-    'NameAttrList',
-    'NodeDef',
-    'OptimizerOptions',
-    'RunOptions',
-    'RunMetadata',
-    'SessionLog',
-    'Summary',
-    'SummaryMetadata',
-    'TensorInfo',  # Used for tf.saved_model functionality.
-]
-
 # Export protos
 # pylint: disable=undefined-variable
 tf_export('AttrValue')(AttrValue)
@@ -216,121 +165,6 @@ tf_export('summary.TaggedRunMetadata')(TaggedRunMetadata)
 tf_export('TensorInfo')(TensorInfo)
 # pylint: enable=undefined-variable
 
-
-# The following symbols are kept for compatibility. It is our plan
-# to remove them in the future.
-_allowed_symbols.extend([
-    'arg_max',
-    'arg_min',
-    'create_partitioned_variables',
-    'deserialize_many_sparse',
-    'lin_space',
-    'listdiff',  # Use tf.listdiff instead.
-    'parse_single_sequence_example',
-    'serialize_many_sparse',
-    'serialize_sparse',
-    'sparse_matmul',  ## use tf.matmul instead.
-])
-
-# This is needed temporarily because we import it explicitly.
-_allowed_symbols.extend([
-    'pywrap_tensorflow',
-])
-
-# Dtypes exported by framework/dtypes.py.
-# TODO(cwhipkey): expose these through documentation.
-_allowed_symbols.extend([
-    'QUANTIZED_DTYPES',
-    'bfloat16',
-    'bool',
-    'complex64',
-    'complex128',
-    'double',
-    'half',
-    'float16',
-    'float32',
-    'float64',
-    'int16',
-    'int32',
-    'int64',
-    'int8',
-    'qint16',
-    'qint32',
-    'qint8',
-    'quint16',
-    'quint8',
-    'string',
-    'uint64',
-    'uint32',
-    'uint16',
-    'uint8',
-    'resource',
-    'variant',
-])
-
-# Export modules and constants.
-_allowed_symbols.extend([
-    'app',
-    'bitwise',
-    'compat',
-    'data',
-    'distributions',
-    'errors',
-    'estimator',
-    'feature_column',
-    'flags',
-    'gfile',
-    'graph_util',
-    'image',
-    'initializers',
-    'keras',
-    'layers',
-    'linalg',
-    'logging',
-    'losses',
-    'manip',
-    'metrics',
-    'newaxis',
-    'nn',
-    'profiler',
-    'python_io',
-    'resource_loader',
-    'saved_model',
-    'sets',
-    'spectral',
-    'summary',
-    'sysconfig',
-    'test',
-    'train',
-    'user_ops',
-])
-
-# Variables framework.versions:
-_allowed_symbols.extend([
-    'VERSION',
-    'GIT_VERSION',
-    'COMPILER_VERSION',
-    'CXX11_ABI_FLAG',
-    'MONOLITHIC_BUILD',
-])
-
-# Eager execution
-_allowed_symbols.extend([
-    'enable_eager_execution',
-    'executing_eagerly',
-])
-
-# Remove all extra symbols that don't have a docstring or are not explicitly
-# referenced in the whitelist.
-remove_undocumented(__name__, _allowed_symbols, [
-    framework_lib, array_ops, check_ops, client_lib, compat, constant_op,
-    control_flow_ops, confusion_matrix_m, data, distributions,
-    functional_ops, histogram_ops, io_ops, keras, layers,
-    losses, math_ops, metrics, nn, profiler, resource_loader, sets, script_ops,
-    session_ops, sparse_ops, state_ops, string_ops, summary, tensor_array_ops,
-    train
-])
-
 # Special dunders that we choose to export:
 _exported_dunders = set([
     '__version__',
diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py
index 239f9b0d59..5cedb89bf8 100644
--- a/tensorflow/python/data/__init__.py
+++ b/tensorflow/python/data/__init__.py
@@ -34,6 +34,3 @@ from tensorflow.python.data.ops.readers import FixedLengthRecordDataset
 from tensorflow.python.data.ops.readers import TextLineDataset
 from tensorflow.python.data.ops.readers import TFRecordDataset
 # pylint: enable=unused-import
-
-from tensorflow.python.util.all_util import remove_undocumented
-remove_undocumented(__name__)
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index e90ce3fb40..eff6e02c14 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -44,7 +44,6 @@ import collections as _collections
 import six as _six
 
 from tensorflow.python.framework import sparse_tensor as _sparse_tensor
-from tensorflow.python.util.all_util import remove_undocumented
 
 
 def _sorted(dict_):
@@ -538,16 +537,3 @@ def map_structure_up_to(shallow_tree, func, *inputs):
   results = [func(*tensors) for tensors in zip(*all_flattened_up_to)]
   return pack_sequence_as(structure=shallow_tree, flat_sequence=results)
 
-
-_allowed_symbols = [
-    "assert_same_structure",
-    "is_sequence",
-    "flatten",
-    "pack_sequence_as",
-    "map_structure",
-    "assert_shallow_structure",
-    "flatten_up_to",
-    "map_structure_up_to",
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/estimator/estimator_lib.py b/tensorflow/python/estimator/estimator_lib.py
index 60c59cbc18..3815f42470 100644
--- a/tensorflow/python/estimator/estimator_lib.py
+++ b/tensorflow/python/estimator/estimator_lib.py
@@ -47,45 +47,4 @@ from tensorflow.python.estimator.training import train_and_evaluate
 from tensorflow.python.estimator.training import TrainSpec
 
 
-from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
-
-_allowed_symbols = [
-    # Canned Estimators
-    'BaselineClassifier',
-    'BaselineRegressor',
-    'BoostedTreesClassifier',
-    'BoostedTreesRegressor',
-    'DNNClassifier',
-    'DNNRegressor',
-    'DNNLinearCombinedClassifier',
-    'DNNLinearCombinedRegressor',
-    'LinearClassifier',
-    'LinearRegressor',
-
-    # I/O
-    'classifier_parse_example_spec',
-    'regressor_parse_example_spec',
-    'inputs',
-    'export',
-
-    # Estimator
-    'Estimator',
-    'EstimatorSpec',
-    'ModeKeys',
-    'RunConfig',
-
-    # Training utilities
-    'train_and_evaluate',
-    'EvalSpec',
-    'TrainSpec',
-    'Exporter',
-    'LatestExporter',
-    'FinalExporter',
-
-    # Warm-starting
-    'WarmStartSettings',
-    'VocabInfo',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/estimator/export/export_lib.py b/tensorflow/python/estimator/export/export_lib.py
index 226fc97fd3..f4ac8581ea 100644
--- a/tensorflow/python/estimator/export/export_lib.py
+++ b/tensorflow/python/estimator/export/export_lib.py
@@ -28,18 +28,5 @@ from tensorflow.python.estimator.export.export_output import ExportOutput
 from tensorflow.python.estimator.export.export_output import PredictOutput
 from tensorflow.python.estimator.export.export_output import RegressionOutput
 
-from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long
 
-_allowed_symbols = [
-    'build_parsing_serving_input_receiver_fn',
-    'build_raw_serving_input_receiver_fn',
-    'ServingInputReceiver',
-    'TensorServingInputReceiver',
-    'ClassificationOutput',
-    'ExportOutput',
-    'PredictOutput',
-    'RegressionOutput',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/estimator/inputs/inputs.py b/tensorflow/python/estimator/inputs/inputs.py
index 1a1c9a6c3f..6be168ee08 100644
--- a/tensorflow/python/estimator/inputs/inputs.py
+++ b/tensorflow/python/estimator/inputs/inputs.py
@@ -22,12 +22,4 @@ from __future__ import print_function
 from tensorflow.python.estimator.inputs.numpy_io import numpy_input_fn
 from tensorflow.python.estimator.inputs.pandas_io import pandas_input_fn
 
-from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long
-
-_allowed_symbols = [
-    'numpy_input_fn',
-    'pandas_input_fn'
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
index 505a1408d2..3b818f18b5 100644
--- a/tensorflow/python/feature_column/feature_column_lib.py
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -20,25 +20,4 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.python.feature_column.feature_column import *
-
-from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long
-
-_allowed_symbols = [
-    'input_layer',
-    'linear_model',
-    'make_parse_example_spec',
-    'embedding_column',
-    'shared_embedding_columns',
-    'crossed_column',
-    'numeric_column',
-    'bucketized_column',
-    'categorical_column_with_hash_bucket',
-    'categorical_column_with_vocabulary_file',
-    'categorical_column_with_vocabulary_list',
-    'categorical_column_with_identity',
-    'weighted_categorical_column',
-    'indicator_column',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/framework/errors.py b/tensorflow/python/framework/errors.py
index c8cf9ae39b..be0187c2ef 100644
--- a/tensorflow/python/framework/errors.py
+++ b/tensorflow/python/framework/errors.py
@@ -25,50 +25,4 @@ from tensorflow.python.framework import errors_impl as _impl
 # pylint: disable=wildcard-import
 from tensorflow.python.framework.errors_impl import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
 
-# These are referenced in client/client_lib.py.
-# Unfortunately, we can't import client_lib to examine
-# the references, since it would create a dependency cycle.
-_allowed_symbols = [
-    "AbortedError",
-    "AlreadyExistsError",
-    "CancelledError",
-    "DataLossError",
-    "DeadlineExceededError",
-    "FailedPreconditionError",
-    "InternalError",
-    "InvalidArgumentError",
-    "NotFoundError",
-    "OpError",
-    "OutOfRangeError",
-    "PermissionDeniedError",
-    "ResourceExhaustedError",
-    "UnauthenticatedError",
-    "UnavailableError",
-    "UnimplementedError",
-    "UnknownError",
-    "error_code_from_exception_type",
-    "exception_type_from_error_code",
-    "raise_exception_on_not_ok_status",
-    # Scalars that have no docstrings:
-    "OK",
-    "CANCELLED",
-    "UNKNOWN",
-    "INVALID_ARGUMENT",
-    "DEADLINE_EXCEEDED",
-    "NOT_FOUND",
-    "ALREADY_EXISTS",
-    "PERMISSION_DENIED",
-    "UNAUTHENTICATED",
-    "RESOURCE_EXHAUSTED",
-    "FAILED_PRECONDITION",
-    "ABORTED",
-    "OUT_OF_RANGE",
-    "UNIMPLEMENTED",
-    "INTERNAL",
-    "UNAVAILABLE",
-    "DATA_LOSS",
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/framework/graph_util.py b/tensorflow/python/framework/graph_util.py
index a666630e44..c5cc110734 100644
--- a/tensorflow/python/framework/graph_util.py
+++ b/tensorflow/python/framework/graph_util.py
@@ -28,14 +28,3 @@ from tensorflow.python.framework.graph_util_impl import must_run_on_cpu
 from tensorflow.python.framework.graph_util_impl import remove_training_nodes
 from tensorflow.python.framework.graph_util_impl import tensor_shape_from_node_def_name
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    # TODO(drpng): find a good place to reference this.
-    "convert_variables_to_constants",
-    "extract_sub_graph",
-    "must_run_on_cpu",
-    "tensor_shape_from_node_def_name",
-    "remove_training_nodes",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/layers/layers.py b/tensorflow/python/layers/layers.py
index 13a8e8e39c..c5fa0d3aba 100644
--- a/tensorflow/python/layers/layers.py
+++ b/tensorflow/python/layers/layers.py
@@ -61,8 +61,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.util.all_util import remove_undocumented
-
 # pylint: disable=g-bad-import-order,unused-import
 
 # Base objects.
@@ -122,7 +120,3 @@ from tensorflow.python.layers.normalization import BatchNormalization
 from tensorflow.python.layers.normalization import batch_normalization
 
 # pylint: enable=g-bad-import-order,unused-import
-
-_allowed_symbols = []
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/lib/io/python_io.py b/tensorflow/python/lib/io/python_io.py
index b92cfe8f80..d4bc8afd1e 100644
--- a/tensorflow/python/lib/io/python_io.py
+++ b/tensorflow/python/lib/io/python_io.py
@@ -31,8 +31,3 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.python.lib.io.tf_record import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = []
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/bitwise_ops.py b/tensorflow/python/ops/bitwise_ops.py
index e8e187e68f..123380cf04 100644
--- a/tensorflow/python/ops/bitwise_ops.py
+++ b/tensorflow/python/ops/bitwise_ops.py
@@ -32,7 +32,6 @@ from tensorflow.python.framework import ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_bitwise_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
 
 ops.NotDifferentiable("BitwiseAnd")
 ops.NotDifferentiable("BitwiseOr")
@@ -41,5 +40,3 @@ ops.NotDifferentiable("Invert")
 ops.NotDifferentiable("PopulationCount")
 ops.NotDifferentiable("LeftShift")
 ops.NotDifferentiable("RightShift")
-
-remove_undocumented(__name__)
diff --git a/tensorflow/python/ops/distributions/bijector.py b/tensorflow/python/ops/distributions/bijector.py
index 84bd0a20da..94a77a205a 100644
--- a/tensorflow/python/ops/distributions/bijector.py
+++ b/tensorflow/python/ops/distributions/bijector.py
@@ -23,8 +23,3 @@ from __future__ import print_function
 from tensorflow.python.ops.distributions.bijector_impl import Bijector
 
 # pylint: enable=wildcard-import,unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ["Bijector"]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/distributions/distributions.py b/tensorflow/python/ops/distributions/distributions.py
index 7c4b8697d8..59ed455e43 100644
--- a/tensorflow/python/ops/distributions/distributions.py
+++ b/tensorflow/python/ops/distributions/distributions.py
@@ -35,29 +35,3 @@ from tensorflow.python.ops.distributions.student_t import StudentT
 from tensorflow.python.ops.distributions.uniform import Uniform
 # pylint: enable=wildcard-import,unused-import
 
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [
-    "Bernoulli",
-    "Beta",
-    "Categorical",
-    "DirichletMultinomial",
-    "Dirichlet",
-    "Distribution",
-    "ReparameterizationType",
-    "FULLY_REPARAMETERIZED",
-    "NOT_REPARAMETERIZED",
-    "Exponential",
-    "Gamma",
-    "RegisterKL",
-    "kl_divergence",
-    "Laplace",
-    "Multinomial",
-    "Normal",
-    "StudentT",
-    "Uniform",
-]
-
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index 6aa6ec40d9..bc321900dc 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -19,8 +19,6 @@ from __future__ import print_function
 
 import numpy as np
 
-# Bijectors must be directly imported because `remove_undocumented` prevents
-# individual file imports.
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/python/ops/gradients.py b/tensorflow/python/ops/gradients.py
index 2668e8f60c..9fa8e27d5c 100644
--- a/tensorflow/python/ops/gradients.py
+++ b/tensorflow/python/ops/gradients.py
@@ -25,14 +25,4 @@ from tensorflow.python.ops.gradients_impl import AggregationMethod
 from tensorflow.python.ops.gradients_impl import gradients
 from tensorflow.python.ops.gradients_impl import hessians
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = [
-    # TODO(drpng): find a good place to reference this.
-    "AggregationMethod",
-    "GradientTape",
-    "custom_gradient",
-    "gradients",  # tf.gradients.gradients.
-    "hessians",  # tf.gradients.hessians
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index 68be9ccdd6..3d40c39181 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -91,13 +91,3 @@ from tensorflow.python.ops.image_ops_impl import *
 from tensorflow.python.ops.image_ops_impl import _Check3DImage
 from tensorflow.python.ops.image_ops_impl import _ImageDimensions
 # pylint: enable=unused-import
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    # ResizeMethod is not documented, but is documented in functions
-    # that use it.
-    'ResizeMethod',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/losses/losses.py b/tensorflow/python/ops/losses/losses.py
index 8532c19ad6..81ee01a41a 100644
--- a/tensorflow/python/ops/losses/losses.py
+++ b/tensorflow/python/ops/losses/losses.py
@@ -35,16 +35,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys
-
-from tensorflow.python.ops.losses import util
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.losses.losses_impl import *
 from tensorflow.python.ops.losses.util import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = []
-
-remove_undocumented(__name__, _allowed_symbols,
-                    [sys.modules[__name__], util])
diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py
index 6d335cdc21..373585395b 100644
--- a/tensorflow/python/ops/manip_ops.py
+++ b/tensorflow/python/ops/manip_ops.py
@@ -22,7 +22,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.ops import gen_manip_ops as _gen_manip_ops
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -34,7 +33,3 @@ def roll(input, shift, axis):  # pylint: disable=redefined-builtin
 
 roll.__doc__ = _gen_manip_ops.roll.__doc__
 # pylint: enable=protected-access
-
-_allowed_symbols = ['roll']
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/ops/metrics.py b/tensorflow/python/ops/metrics.py
index 7e75542aec..d1a8249154 100644
--- a/tensorflow/python/ops/metrics.py
+++ b/tensorflow/python/ops/metrics.py
@@ -58,8 +58,3 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.metrics_impl import *
 # pylint: enable=wildcard-import
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = []
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index 1d0d9a52a1..25e4add569 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -117,7 +117,6 @@ from tensorflow.python.ops import nn_ops as _nn_ops
 from tensorflow.python.ops.math_ops import sigmoid
 from tensorflow.python.ops.math_ops import tanh
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
 
 # Bring more nn-associated functionality into this package.
 # go/tf-wildcard-import
@@ -128,22 +127,3 @@ from tensorflow.python.ops.nn_ops import *
 from tensorflow.python.ops.candidate_sampling_ops import *
 from tensorflow.python.ops.embedding_ops import *
 # pylint: enable=wildcard-import,unused-import
-
-
-# TODO(cwhipkey): sigmoid and tanh should not be exposed from tf.nn.
-_allowed_symbols = [
-    "zero_fraction",  # documented in training.py
-    # Modules whitelisted for reference through tf.nn.
-    # TODO(cwhipkey): migrate callers to use the submodule directly.
-    # Symbols whitelisted for export without documentation.
-    # TODO(cwhipkey): review these and move to contrib or expose through
-    # documentation.
-    "all_candidate_sampler",  # Excluded in gen_docs_combined.
-    "lrn",  # Excluded in gen_docs_combined.
-    "relu_layer",  # Excluded in gen_docs_combined.
-    "xw_plus_b",  # Excluded in gen_docs_combined.
-    "rnn_cell",  # rnn_cell is a submodule of tf.nn.
-]
-
-remove_undocumented(__name__, _allowed_symbols,
-                    [_sys.modules[__name__], _ctc_ops, _nn_ops, _nn_grad])
diff --git a/tensorflow/python/ops/rnn_cell.py b/tensorflow/python/ops/rnn_cell.py
index c0dac8fb01..3d26ffb7ae 100644
--- a/tensorflow/python/ops/rnn_cell.py
+++ b/tensorflow/python/ops/rnn_cell.py
@@ -44,8 +44,3 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.rnn_cell_impl import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = []
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/sdca_ops.py b/tensorflow/python/ops/sdca_ops.py
index 8b7e5abbc2..24ea68892a 100644
--- a/tensorflow/python/ops/sdca_ops.py
+++ b/tensorflow/python/ops/sdca_ops.py
@@ -31,11 +31,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops.gen_sdca_ops import *
 # pylint: enable=wildcard-import
 
-from tensorflow.python.util.all_util import remove_undocumented
-
 ops.NotDifferentiable("SdcaFprint")
 ops.NotDifferentiable("SdcaOptimizer")
 ops.NotDifferentiable("SdcaShrinkL1")
-
-
-remove_undocumented(__name__)
diff --git a/tensorflow/python/ops/sets.py b/tensorflow/python/ops/sets.py
index ea4677befe..54d6e1db41 100644
--- a/tensorflow/python/ops/sets.py
+++ b/tensorflow/python/ops/sets.py
@@ -28,8 +28,3 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.sets_impl import *
 # pylint: enable=wildcard-import
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = []
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/spectral_ops.py b/tensorflow/python/ops/spectral_ops.py
index a579688276..4a4ca693dc 100644
--- a/tensorflow/python/ops/spectral_ops.py
+++ b/tensorflow/python/ops/spectral_ops.py
@@ -40,7 +40,6 @@ from tensorflow.python.framework import tensor_util as _tensor_util
 from tensorflow.python.ops import array_ops as _array_ops
 from tensorflow.python.ops import gen_spectral_ops
 from tensorflow.python.ops import math_ops as _math_ops
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -249,5 +248,3 @@ def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disabl
       dct2 *= weights
 
     return dct2
-
-remove_undocumented(__name__)
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index f71f98aa12..a2d24711e2 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -33,7 +33,6 @@ from tensorflow.python.ops import sparse_grad
 from tensorflow.python.ops import spectral_grad
 from tensorflow.python.ops import state_grad
 from tensorflow.python.ops import tensor_array_grad
-from tensorflow.python.util.all_util import remove_undocumented
 
 
 # go/tf-wildcard-import
@@ -99,212 +98,3 @@ from tensorflow.python.ops.variables import *
 # pylint: enable=wildcard-import
 # pylint: enable=g-bad-import-order
 
-#### For use in remove_undocumented below:
-from tensorflow.python.framework import constant_op as _constant_op
-from tensorflow.python.ops import array_ops as _array_ops
-from tensorflow.python.ops import check_ops as _check_ops
-from tensorflow.python.ops import clip_ops as _clip_ops
-from tensorflow.python.ops import confusion_matrix as _confusion_matrix
-from tensorflow.python.ops import control_flow_ops as _control_flow_ops
-from tensorflow.python.ops import data_flow_ops as _data_flow_ops
-from tensorflow.python.ops import functional_ops as _functional_ops
-from tensorflow.python.ops import gradients as _gradients
-from tensorflow.python.ops import histogram_ops as _histogram_ops
-from tensorflow.python.ops import init_ops as _init_ops
-from tensorflow.python.ops import io_ops as _io_ops
-from tensorflow.python.ops import linalg_ops as _linalg_ops
-from tensorflow.python.ops import logging_ops as _logging_ops
-from tensorflow.python.ops import manip_ops as _manip_ops
-from tensorflow.python.ops import math_ops as _math_ops
-from tensorflow.python.ops import numerics as _numerics
-from tensorflow.python.ops import parsing_ops as _parsing_ops
-from tensorflow.python.ops import partitioned_variables as _partitioned_variables
-from tensorflow.python.ops import random_ops as _random_ops
-from tensorflow.python.ops import script_ops as _script_ops
-from tensorflow.python.ops import session_ops as _session_ops
-from tensorflow.python.ops import sparse_ops as _sparse_ops
-from tensorflow.python.ops import special_math_ops as _special_math_ops
-from tensorflow.python.ops import state_ops as _state_ops
-from tensorflow.python.ops import string_ops as _string_ops
-from tensorflow.python.ops import template as _template
-from tensorflow.python.ops import tensor_array_ops as _tensor_array_ops
-from tensorflow.python.ops import variable_scope as _variable_scope
-from tensorflow.python.ops import variables as _variables
-
-
-_allowed_symbols_math_ops = [
-    # TODO(drpng): decide if we want to reference these in the documentation.
-    "reduced_shape",
-    "sparse_segment_mean_grad",
-    "sparse_segment_sqrt_n_grad",
-
-    # Legacy: will be removed.
-    "arg_max",
-    "arg_min",
-    "lin_space",
-    "sparse_matmul",  # Use tf.matmul.
-    # Deprecated (see versions.h):
-    "batch_fft",
-    "batch_fft2d",
-    "batch_fft3d",
-    "batch_ifft",
-    "batch_ifft2d",
-    "batch_ifft3d",
-    "mul",  # use tf.multiply instead.
-    "neg",  # use tf.negative instead.
-    "sub",  # use tf.subtract instead.
-
-    # These are documented in nn.
-    # We are not importing nn because it would create a circular dependency.
-    "sigmoid",
-    "log_sigmoid",
-    "tanh",
-]
-
-_allowed_symbols_array_ops = [
-    # TODO(drpng): make sure they are documented.
-    # Scalars:
-    "NEW_AXIS",
-    "SHRINK_AXIS",
-    "newaxis",
-
-    # Documented in training.py.
-    # I do not import train, to avoid circular dependencies.
-    # TODO(drpng): this is defined in gen_array_ops, clearly not the right
-    # place.
-    "stop_gradient",
-
-    # See gen_docs_combined for tf.copy documentation.
-    "copy",
-
-    ## TODO(drpng): make them inaccessible directly.
-    ## TODO(drpng): Below, to-doc means that we need to find an appropriate
-    ##  documentation section to reference.
-    ## For re-exporting to tf.*:
-    "constant",
-    "edit_distance",  # to-doc
-    # From gen_array_ops:
-    "copy_host",  # to-doc
-    "immutable_const",  # to-doc
-    "invert_permutation",  # to-doc
-    "quantize_and_dequantize",  # to-doc
-
-    # TODO(drpng): legacy symbols to be removed.
-    "batch_matrix_diag",
-    "batch_matrix_band_part",
-    "batch_matrix_diag_part",
-    "batch_matrix_set_diag",
-]
-
-_allowed_symbols_partitioned_variables = [
-    "PartitionedVariable",   # Requires doc link.
-    # Legacy.
-    "create_partitioned_variables",
-    "variable_axis_size_partitioner",
-    "min_max_variable_partitioner",
-    "fixed_size_partitioner",
-]
-
-_allowed_symbols_control_flow_ops = [
-    # TODO(drpng): Find a place in the documentation to reference these or
-    # remove.
-    "control_trigger",
-    "loop_cond",
-    "merge",
-    "switch",
-]
-
-_allowed_symbols_functional_ops = [
-    "nest",  # Used by legacy code.
-]
-
-_allowed_symbols_gradients = [
-    # Documented in training.py:
-    # Not importing training.py to avoid complex graph dependencies.
-    "AggregationMethod",
-    "GradientTape",
-    "custom_gradient",
-    "gradients",  # tf.gradients = gradients.gradients
-    "hessians",
-]
-
-_allowed_symbols_clip_ops = [
-    # Documented in training.py:
-    # Not importing training.py to avoid complex graph dependencies.
-    "clip_by_average_norm",
-    "clip_by_global_norm",
-    "clip_by_norm",
-    "clip_by_value",
-    "global_norm",
-]
-
-_allowed_symbols_logging_ops = [
-    # Documented in training.py.
-    # We are not importing training.py to avoid complex dependencies.
-    "audio_summary",
-    "histogram_summary",
-    "image_summary",
-    "merge_all_summaries",
-    "merge_summary",
-    "scalar_summary",
-
-    # TODO(drpng): link in training.py if it should be documented.
-    "get_summary_op",
-]
-
-_allowed_symbols_variable_scope_ops = [
-    "get_local_variable",  # Documented in framework package.
-]
-
-_allowed_symbols_misc = [
-    "deserialize_many_sparse",
-    "parse_single_sequence_example",
-    "serialize_many_sparse",
-    "serialize_sparse",
-    "confusion_matrix",
-]
-
-_allowed_symbols = (_allowed_symbols_array_ops +
-                    _allowed_symbols_clip_ops +
-                    _allowed_symbols_control_flow_ops +
-                    _allowed_symbols_functional_ops +
-                    _allowed_symbols_gradients +
-                    _allowed_symbols_logging_ops +
-                    _allowed_symbols_math_ops +
-                    _allowed_symbols_variable_scope_ops +
-                    _allowed_symbols_misc +
-                    _allowed_symbols_partitioned_variables)
-
-remove_undocumented(__name__, _allowed_symbols, [
-    _sys.modules[__name__],
-    _array_ops,
-    _check_ops,
-    _clip_ops,
-    _confusion_matrix,
-    _control_flow_ops,
-    _constant_op,
-    _data_flow_ops,
-    _functional_ops,
-    _gradients,
-    _histogram_ops,
-    _init_ops,
-    _io_ops,
-    _linalg_ops,
-    _logging_ops,
-    _manip_ops,
-    _math_ops,
-    _numerics,
-    _parsing_ops,
-    _partitioned_variables,
-    _random_ops,
-    _script_ops,
-    _session_ops,
-    _sparse_ops,
-    _special_math_ops,
-    _state_ops,
-    _string_ops,
-    _template,
-    _tensor_array_ops,
-    _variable_scope,
-    _variables,
-])
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index cce64c0cca..4c91bc3652 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -22,7 +22,6 @@ import errno as _errno
 import sys as _sys
 
 from tensorflow.python.platform import flags
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -125,11 +124,3 @@ def run(main=None, argv=None):
   # to the final program.
   _sys.exit(main(argv))
 
-
-_allowed_symbols = [
-    'run',
-    # Allowed submodule.
-    'flags',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/platform/gfile.py b/tensorflow/python/platform/gfile.py
index 315889e9aa..fd697d70bf 100644
--- a/tensorflow/python/platform/gfile.py
+++ b/tensorflow/python/platform/gfile.py
@@ -33,7 +33,6 @@ from tensorflow.python.lib.io.file_io import rename as Rename
 from tensorflow.python.lib.io.file_io import stat as Stat
 from tensorflow.python.lib.io.file_io import walk as Walk
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -56,24 +55,3 @@ class FastGFile(_FileIO):
 # Does not alias to Open so that we use our version of GFile to strip
 # 'b' mode.
 Open = GFile
-
-# TODO(drpng): Find the right place to document these.
-_allowed_symbols = [
-    'Copy',
-    'DeleteRecursively',
-    'Exists',
-    'FastGFile',
-    'GFile',
-    'Glob',
-    'IsDirectory',
-    'ListDirectory',
-    'Open',
-    'MakeDirs',
-    'MkDir',
-    'Remove',
-    'Rename',
-    'Stat',
-    'Walk',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/platform/resource_loader.py b/tensorflow/python/platform/resource_loader.py
index 8f7b12e2b2..650a1fd851 100644
--- a/tensorflow/python/platform/resource_loader.py
+++ b/tensorflow/python/platform/resource_loader.py
@@ -28,7 +28,6 @@ import os as _os
 import sys as _sys
 
 from tensorflow.python.util import tf_inspect as _inspect
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -129,7 +128,3 @@ def get_path_to_datafile(path):
 def readahead_file_path(path, readahead='128M'):  # pylint: disable=unused-argument
   """Readahead files not implemented; simply returns given path."""
   return path
-
-
-_allowed_symbols = []
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/platform/sysconfig.py b/tensorflow/python/platform/sysconfig.py
index fdd2b903fc..56759d1b8e 100644
--- a/tensorflow/python/platform/sysconfig.py
+++ b/tensorflow/python/platform/sysconfig.py
@@ -28,7 +28,6 @@ import os.path as _os_path
 
 from tensorflow.python.framework.versions import CXX11_ABI_FLAG as _CXX11_ABI_FLAG
 from tensorflow.python.framework.versions import MONOLITHIC_BUILD as _MONOLITHIC_BUILD
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -84,6 +83,3 @@ def get_link_flags():
     flags.append('-L%s' % get_lib())
     flags.append('-ltensorflow_framework')
   return flags
-
-_allowed_symbols = []
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index 1660791feb..0a0fe68be5 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -42,7 +42,6 @@ from __future__ import print_function
 # pylint: disable=g-bad-import-order
 from tensorflow.python.framework import test_util as _test_util
 from tensorflow.python.platform import googletest as _googletest
-from tensorflow.python.util.all_util import remove_undocumented
 
 # pylint: disable=unused-import
 from tensorflow.python.framework.test_util import assert_equal_graph_def
@@ -108,13 +107,3 @@ def test_src_dir_path(relative_path):
 def is_built_with_cuda():
   """Returns whether TensorFlow was built with CUDA (GPU) support."""
   return _test_util.IsGoogleCudaEnabled()
-
-
-_allowed_symbols = [
-    # We piggy-back googletest documentation.
-    'Benchmark',
-    'mock',
-    'StubOutForTesting',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 22aabfd712..5962d2f220 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -34,7 +34,6 @@ import threading
 
 import six
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -287,35 +286,8 @@ def _get_thread_id():
 
 _log_prefix = google2_log_prefix
 
-# Controls which methods from pyglib.logging are available within the project.
-# Do not add methods here without also adding to platform/tf_logging.py.
-_allowed_symbols = [
-    'DEBUG',
-    'ERROR',
-    'FATAL',
-    'INFO',
-    'TaskLevelStatusMessage',
-    'WARN',
-    'debug',
-    'error',
-    'fatal',
-    'flush',
-    'get_verbosity',
-    'info',
-    'log',
-    'log_if',
-    'log_every_n',
-    'log_first_n',
-    'set_verbosity',
-    'vlog',
-    'warn',
-    'warning',
-]
-
 tf_export('logging.DEBUG').export_constant(__name__, 'DEBUG')
 tf_export('logging.ERROR').export_constant(__name__, 'ERROR')
 tf_export('logging.FATAL').export_constant(__name__, 'FATAL')
 tf_export('logging.INFO').export_constant(__name__, 'INFO')
 tf_export('logging.WARN').export_constant(__name__, 'WARN')
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/profiler/profiler.py b/tensorflow/python/profiler/profiler.py
index fa7f30b236..efbdd1ba68 100644
--- a/tensorflow/python/profiler/profiler.py
+++ b/tensorflow/python/profiler/profiler.py
@@ -30,7 +30,6 @@ from tensorflow.python.profiler.model_analyzer import Profiler
 from tensorflow.python.profiler.option_builder import ProfileOptionBuilder
 from tensorflow.python.profiler.tfprof_logger import write_op_log
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -54,11 +53,3 @@ tf_export('profiler.GraphNodeProto')(GraphNodeProto)
 tf_export('profiler.MultiGraphNodeProto')(MultiGraphNodeProto)
 tf_export('profiler.AdviceProto')(AdviceProto)
 tf_export('profiler.OpLogProto')(OpLogProto)
-
-remove_undocumented(__name__, _allowed_symbols, [
-    Profiler,
-    profile,
-    ProfileOptionBuilder,
-    advise,
-    write_op_log,
-])
diff --git a/tensorflow/python/saved_model/builder.py b/tensorflow/python/saved_model/builder.py
index 766b0a3579..be49c70c60 100644
--- a/tensorflow/python/saved_model/builder.py
+++ b/tensorflow/python/saved_model/builder.py
@@ -26,10 +26,3 @@ from __future__ import print_function
 # pylint: disable=unused-import
 from tensorflow.python.saved_model.builder_impl import SavedModelBuilder
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [
-    "SavedModelBuilder",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/constants.py b/tensorflow/python/saved_model/constants.py
index ec49a0539f..34206c6f6d 100644
--- a/tensorflow/python/saved_model/constants.py
+++ b/tensorflow/python/saved_model/constants.py
@@ -19,7 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 # Subdirectory name containing the asset files.
@@ -66,17 +65,3 @@ tf_export("saved_model.constants.VARIABLES_DIRECTORY").export_constant(
 VARIABLES_FILENAME = "variables"
 tf_export("saved_model.constants.VARIABLES_FILENAME").export_constant(
     __name__, "VARIABLES_FILENAME")
-
-
-_allowed_symbols = [
-    "ASSETS_DIRECTORY",
-    "ASSETS_KEY",
-    "LEGACY_INIT_OP_KEY",
-    "MAIN_OP_KEY",
-    "SAVED_MODEL_SCHEMA_VERSION",
-    "SAVED_MODEL_FILENAME_PB",
-    "SAVED_MODEL_FILENAME_PBTXT",
-    "VARIABLES_DIRECTORY",
-    "VARIABLES_FILENAME",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/loader.py b/tensorflow/python/saved_model/loader.py
index 0a7f516287..334298c232 100644
--- a/tensorflow/python/saved_model/loader.py
+++ b/tensorflow/python/saved_model/loader.py
@@ -67,11 +67,3 @@ from __future__ import print_function
 from tensorflow.python.saved_model.loader_impl import load
 from tensorflow.python.saved_model.loader_impl import maybe_saved_model_directory
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [
-    "load",
-    "maybe_saved_model_directory",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/main_op.py b/tensorflow/python/saved_model/main_op.py
index 04cadeab66..18d11b900c 100644
--- a/tensorflow/python/saved_model/main_op.py
+++ b/tensorflow/python/saved_model/main_op.py
@@ -26,10 +26,3 @@ from __future__ import print_function
 from tensorflow.python.saved_model.main_op_impl import main_op
 from tensorflow.python.saved_model.main_op_impl import main_op_with_restore
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    "main_op",
-    "main_op_with_restore",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/saved_model.py b/tensorflow/python/saved_model/saved_model.py
index caabd7bc30..6702c99607 100644
--- a/tensorflow/python/saved_model/saved_model.py
+++ b/tensorflow/python/saved_model/saved_model.py
@@ -34,18 +34,3 @@ from tensorflow.python.saved_model import utils
 from tensorflow.python.saved_model.simple_save import *
 # pylint: enable=wildcard-import
 
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [
-    "builder",
-    "constants",
-    "loader",
-    "main_op",
-    "signature_constants",
-    "signature_def_utils",
-    "simple_save",
-    "tag_constants",
-    "utils",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/signature_constants.py b/tensorflow/python/saved_model/signature_constants.py
index 6461fe8a7e..819f351291 100644
--- a/tensorflow/python/saved_model/signature_constants.py
+++ b/tensorflow/python/saved_model/signature_constants.py
@@ -19,7 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -95,19 +94,3 @@ tf_export("saved_model.signature_constants.REGRESS_OUTPUTS").export_constant(
     __name__, "REGRESS_OUTPUTS")
 
 ################################################################################
-
-
-_allowed_symbols = [
-    "DEFAULT_SERVING_SIGNATURE_DEF_KEY",
-    "CLASSIFY_INPUTS",
-    "CLASSIFY_METHOD_NAME",
-    "CLASSIFY_OUTPUT_CLASSES",
-    "CLASSIFY_OUTPUT_SCORES",
-    "PREDICT_INPUTS",
-    "PREDICT_METHOD_NAME",
-    "PREDICT_OUTPUTS",
-    "REGRESS_INPUTS",
-    "REGRESS_METHOD_NAME",
-    "REGRESS_OUTPUTS",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/tag_constants.py b/tensorflow/python/saved_model/tag_constants.py
index d164e2c23f..5a797da791 100644
--- a/tensorflow/python/saved_model/tag_constants.py
+++ b/tensorflow/python/saved_model/tag_constants.py
@@ -19,7 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -40,11 +39,3 @@ tf_export("saved_model.tag_constants.GPU").export_constant(__name__, "GPU")
 # Tag for the `tpu` graph.
 TPU = "tpu"
 tf_export("saved_model.tag_constants.TPU").export_constant(__name__, "TPU")
-
-_allowed_symbols = [
-    "SERVING",
-    "TRAINING",
-    "GPU",
-    "TPU"
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/utils.py b/tensorflow/python/saved_model/utils.py
index 8e750d8708..27c3554909 100644
--- a/tensorflow/python/saved_model/utils.py
+++ b/tensorflow/python/saved_model/utils.py
@@ -24,7 +24,3 @@ from __future__ import print_function
 from tensorflow.python.saved_model.utils_impl import build_tensor_info
 from tensorflow.python.saved_model.utils_impl import get_tensor_from_tensor_info
 # pylint: enable=unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ["build_tensor_info", "get_tensor_from_tensor_info"]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 1286ed6703..969cbe7d35 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -74,7 +74,6 @@ from tensorflow.python.summary.writer.writer_cache import FileWriterCache
 # pylint: enable=unused-import
 
 from tensorflow.python.util import compat as _compat
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -361,10 +360,3 @@ def get_summary_description(node_def):
   summary_description = SummaryDescription()
   _json_format.Parse(description_str, summary_description)
   return summary_description
-
-
-_allowed_symbols = [
-    'Summary', 'SummaryDescription', 'Event', 'TaggedRunMetadata', 'SessionLog',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/training/queue_runner.py b/tensorflow/python/training/queue_runner.py
index 42559d1e62..92207d97cd 100644
--- a/tensorflow/python/training/queue_runner.py
+++ b/tensorflow/python/training/queue_runner.py
@@ -22,13 +22,3 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.python.training.queue_runner_impl import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [
-    # Documented in training.py:
-    "QueueRunner",
-    "add_queue_runner",
-    "start_queue_runners",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index d7e5078be7..4ae7f84510 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -105,13 +105,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys as _sys
-
-from tensorflow.python.ops import io_ops as _io_ops
-from tensorflow.python.ops import sdca_ops as _sdca_ops
-from tensorflow.python.ops import state_ops as _state_ops
-from tensorflow.python.util.all_util import remove_undocumented
-
 # pylint: disable=g-bad-import-order,unused-import
 from tensorflow.python.ops.sdca_ops import sdca_optimizer
 from tensorflow.python.ops.sdca_ops import sdca_fprint
@@ -215,39 +208,6 @@ from tensorflow.core.protobuf.tensorflow_server_pb2 import ServerDef
 from tensorflow.python.training.server_lib import ClusterSpec
 from tensorflow.python.training.server_lib import Server
 
-# Symbols whitelisted for export without documentation.
-_allowed_symbols = [
-    # TODO(cwhipkey): review these and move to contrib or expose through
-    # documentation.
-    "generate_checkpoint_state_proto",  # Used internally by saver.
-    "checkpoint_exists",  # Only used in test?
-    "get_checkpoint_mtimes",  # Only used in test?
-
-    # Legacy: remove.
-    "do_quantize_training_on_graphdef",  # At least use grah_def, not graphdef.
-    # No uses within tensorflow.
-    "queue_runner",  # Use tf.train.start_queue_runner etc directly.
-    # This is also imported internally.
-
-    # TODO(drpng): document these. The reference in howtos/distributed does
-    # not link.
-    "SyncReplicasOptimizer",
-    # Protobufs:
-    "BytesList",  # from example_pb2.
-    "ClusterDef",
-    "Example",  # from example_pb2
-    "Feature",  # from example_pb2
-    "Features",  # from example_pb2
-    "FeatureList",  # from example_pb2
-    "FeatureLists",  # from example_pb2
-    "FloatList",  # from example_pb2.
-    "Int64List",  # from example_pb2.
-    "JobDef",
-    "SaverDef",  # From saver_pb2.
-    "SequenceExample",  # from example_pb2.
-    "ServerDef",
-]
-
 # pylint: disable=undefined-variable
 tf_export("train.BytesList")(BytesList)
 tf_export("train.ClusterDef")(ClusterDef)
@@ -263,9 +223,3 @@ tf_export("train.SaverDef")(SaverDef)
 tf_export("train.SequenceExample")(SequenceExample)
 tf_export("train.ServerDef")(ServerDef)
 # pylint: enable=undefined-variable
-
-# Include extra modules for docstrings because:
-# * Input methods in tf.train are documented in io_ops.
-# * Saver methods in tf.train are documented in state_ops.
-remove_undocumented(__name__, _allowed_symbols,
-                    [_sys.modules[__name__], _io_ops, _sdca_ops, _state_ops])
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 3358ffe526..1aba7584d1 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -40,7 +40,6 @@ import numbers as _numbers
 import numpy as _np
 import six as _six
 
-from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -142,13 +141,3 @@ tf_export('compat.complex_types').export_constant(__name__, 'complex_types')
 bytes_or_text_types = (bytes, _six.text_type)
 tf_export('compat.bytes_or_text_types').export_constant(__name__,
                                                         'bytes_or_text_types')
-
-_allowed_symbols = [
-    'as_str',
-    'bytes_or_text_types',
-    'complex_types',
-    'integral_types',
-    'real_types',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 5622431bc9..1104768ae8 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -36,7 +36,6 @@ import collections as _collections
 import six as _six
 
 from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow
-from tensorflow.python.util.all_util import remove_undocumented
 
 
 def _sorted(dict_):
@@ -758,21 +757,3 @@ def flatten_with_joined_string_paths(structure, separator="/"):
 
 
 _pywrap_tensorflow.RegisterSequenceClass(_collections.Sequence)
-
-
-_allowed_symbols = [
-    "assert_same_structure",
-    "is_sequence",
-    "flatten",
-    "flatten_dict_items",
-    "pack_sequence_as",
-    "map_structure",
-    "assert_shallow_structure",
-    "flatten_up_to",
-    "map_structure_up_to",
-    "get_traverse_shallow_structure",
-    "yield_flat_paths",
-    "flatten_with_joined_string_paths",
-]
-
-remove_undocumented(__name__, _allowed_symbols)
-- 
GitLab


From 03e5bf2d8c3f69782b345c5a849ffc1157f12cec Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Wed, 25 Apr 2018 12:06:28 -0700
Subject: [PATCH 423/450] Switch tf.keras.Model.save_weights to TensorFlow
 format for graph networks.

Some tweaks to support loading checkpoints into modified Python programs.

Relaxes the Checkpointable consistency check for object matching: if the same
object in the checkpoint matches two different Python objects, it will just
choose the first one that matches (based on whichever traversal of the Python
dependency graph it's doing). assert_consumed() on the status object will fail,
but this gives users the option of continuing anyway.

Adds a "weight-bearing layer index" dependency to graph networks which skips
Layers without weights, in addition to the regular layer index. This lets users
add Layers without weights without breaking checkpoints, just as they already
could when matching against the flattened weights of the HDF5 format.

Eventually I'd like to add a dependency structure which matches the topology of
the graph itself (so a Layer would have checkpoint dependencies on other Layers
it outputs to), but there are some subtleties before that's useful (it'd need
something like a secondary check that the Python classes match). I think the
scheme in this CL is robust enough for general use, and adding more dependencies
later can make it more robust now that users won't run into consistency check
errors (previously more dependencies would only make matching more picky).

PiperOrigin-RevId: 194277075
---
 .../keras/_impl/keras/engine/network.py       | 72 +++++++++----------
 .../keras/_impl/keras/engine/saving_test.py   | 35 +++------
 .../keras/_impl/keras/engine/sequential.py    | 32 +--------
 tensorflow/python/training/checkpointable.py  | 14 ++--
 .../python/training/checkpointable_utils.py   | 13 ++++
 .../training/checkpointable_utils_test.py     | 11 +--
 6 files changed, 76 insertions(+), 101 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py
index 9f8ee129aa..a0229be346 100644
--- a/tensorflow/python/keras/_impl/keras/engine/network.py
+++ b/tensorflow/python/keras/_impl/keras/engine/network.py
@@ -239,6 +239,8 @@ class Network(base_layer.Layer):
     self._layers = layers
     self._layers_by_depth = layers_by_depth
 
+    self._track_layers(layers)
+
     # Create the node linking internal inputs to internal outputs.
     base_layer.Node(
         outbound_layer=self,
@@ -298,6 +300,23 @@ class Network(base_layer.Layer):
     self.inputs = None
     self.built = False
 
+  def _track_layers(self, layers):
+    """Add Checkpointable dependencies on a list of Layers."""
+    weight_layer_index = 0
+    for layer_index, layer in enumerate(layers):
+      if layer.weights:
+        # Keep a separate index for layers which have weights. This allows users
+        # to insert Layers without weights anywhere in the network without
+        # breaking checkpoints.
+        self._track_checkpointable(
+            layer, name='layer_with_weights-%d' % weight_layer_index,
+            overwrite=True)
+        weight_layer_index += 1
+      # Even if it doesn't have weights, we should still track everything in
+      # case it has/will have Checkpointable dependencies.
+      self._track_checkpointable(
+          layer, name='layer-%d' % layer_index, overwrite=True)
+
   def __setattr__(self, name, value):
     if isinstance(value, (base_layer.Layer, Network)):
       try:
@@ -1153,14 +1172,15 @@ class Network(base_layer.Layer):
           - For every weight in the layer, a dataset
               storing the weight value, named after the weight tensor.
 
-    Currently the TensorFlow format is only supported for user-defined classes
-    inheriting from `tf.keras.Model`, and not for networks constructed from
-    inputs and outputs (using `tf.keras.Model(inputs, outputs)`).
-
     When saving in TensorFlow format, all objects referenced by the network are
-    saved in the same format as `tf.train.Checkpoint`, including any `Layer`s or
-    `Optimizer`s assigned to attributes in the constructor. See
-    `tf.train.Checkpoint`'s documentation for details.
+    saved in the same format as `tf.train.Checkpoint`, including any `Layer`
+    instances or `Optimizer` instances assigned to object attributes. For
+    networks constructed from inputs and outputs using `tf.keras.Model(inputs,
+    outputs)`, `Layer` instances used by the network are tracked/saved
+    automatically. For user-defined classes which inherit from `tf.keras.Model`,
+    `Layer` instances must be assigned to object attributes, typically in the
+    constructor. See the documentation of `tf.train.Checkpoint` and
+    `tf.keras.Model` for details.
 
     Arguments:
         filepath: String, path to the file to save the weights to. When saving
@@ -1169,12 +1189,9 @@ class Network(base_layer.Layer):
             weights to be saved in HDF5 format.
         overwrite: Whether to silently overwrite any existing file at the
             target location, or provide the user with a manual prompt.
-        save_format: Either 'tf' or 'h5'. If `None`, defaults to 'tf' for
-            user-defined classes inheriting from `tf.keras.Model` and 'h5' for
-            networks constructed from inputs and outputs. `filepath`s ending in
-            '.h5' or '.keras' always default to HDF5. Currently only 'h5' is
-            supported for networks constructed from inputs and outputs. Once
-            supported, the default for all networks will switch to 'tf'.
+        save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or
+            '.keras' will default to HDF5 if `save_format` is `None`. Otherwise
+            `None` defaults to 'tf'.
 
     Raises:
         ImportError: If h5py is not available when attempting to save in HDF5
@@ -1186,13 +1203,7 @@ class Network(base_layer.Layer):
       if filepath_is_h5:
         save_format = 'h5'
       else:
-        if self._is_graph_network:
-          # TODO(allenl): Handle loading by weight index and fix dependencies,
-          # then enable 'tensorflow' format by default for graph networks.
-          save_format = 'h5'
-        else:
-          # Subclassed models save in TensorFlow format by default.
-          save_format = 'tf'
+        save_format = 'tf'
     else:
       user_format = save_format.lower().strip()
       if user_format in ('tensorflow', 'tf'):
@@ -1214,10 +1225,6 @@ class Network(base_layer.Layer):
       raise ImportError(
           '`save_weights` requires h5py when saving in hdf5.')
     if save_format == 'tf':
-      if self._is_graph_network:
-        raise NotImplementedError(
-            'Networks constructed from inputs and outputs do not yet support '
-            'saving weights in the TensorFlow ("tf") save_format.')
       check_filepath = filepath + '.index'
     else:
       check_filepath = filepath
@@ -1273,19 +1280,12 @@ class Network(base_layer.Layer):
         ImportError: If h5py is not available and the weight file is in HDF5
             format.
     """
-    if self._is_graph_network:
-      # Graph networks do not currently support TensorFlow formatted weight
-      # files.
+    try:
+      pywrap_tensorflow.NewCheckpointReader(filepath)
+      save_format = 'tf'
+    except errors_impl.DataLossError:
+      # The checkpoint is not readable in TensorFlow format. Try HDF5.
       save_format = 'h5'
-    else:
-      save_format = None
-    if save_format is None:
-      try:
-        pywrap_tensorflow.NewCheckpointReader(filepath)
-        save_format = 'tf'
-      except errors_impl.DataLossError:
-        # The checkpoint is not readable in TensorFlow format. Try HDF5.
-        save_format = 'h5'
     if save_format == 'tf':
       status = self._checkpointable_saver.restore(filepath)
       if by_name:
diff --git a/tensorflow/python/keras/_impl/keras/engine/saving_test.py b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
index 8764ae5e9c..edd296a281 100644
--- a/tensorflow/python/keras/_impl/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
@@ -67,8 +67,10 @@ class TestWeightSavingAndLoading(test.TestCase):
       self.addCleanup(shutil.rmtree, temp_dir)
 
       no_extension_path = os.path.join(temp_dir, 'test')
-      with self.assertRaises(NotImplementedError):
-        model.save_weights(no_extension_path, save_format='tensorflow')
+      model.save_weights(no_extension_path, save_format='tf')
+      model.load_weights(no_extension_path)
+      y = model.predict(x)
+      self.assertAllClose(ref_y, y)
 
       if h5py is None:
         return  # Skip rest of test if H5py isn't available.
@@ -83,11 +85,6 @@ class TestWeightSavingAndLoading(test.TestCase):
       y = model.predict(x)
       self.assertAllClose(ref_y, y)
 
-      model.save_weights(no_extension_path)
-      model.load_weights(no_extension_path)
-      y = model.predict(x)
-      self.assertAllClose(ref_y, y)
-
       model.save_weights(no_extension_path, save_format='hdf5')
       model.load_weights(no_extension_path)
       y = model.predict(x)
@@ -490,8 +487,6 @@ class SubclassedModel(training.Model):
     return self.b_layer(self.x_layer(a))
 
 
-# TODO(allenl): The graph model tests in this TestCase are still saving in
-# hdf5. Get them to save in tensorflow format.
 class TestWeightSavingAndLoadingTFFormat(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -545,7 +540,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
       if not executing_eagerly:
         session.run([v.initializer for v in model.variables])
       ref_y = self.evaluate(ref_y_tensor)
-      model.save_weights(prefix)
+      model.save_weights(prefix, save_format='tf')
       for v in model.variables:
         self.evaluate(
             v.assign(random_ops.random_normal(shape=array_ops.shape(v))))
@@ -572,9 +567,6 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
       b = keras.layers.Dense(1)(x)
       return keras.models.Model(a, b)
 
-    if h5py is None:
-      self.skipTest('This test only works with h5py.')
-
     self._weight_loading_test_template(_make_graph_model)
 
   @test_util.run_in_graph_and_eager_modes()
@@ -582,7 +574,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
     self._weight_loading_test_template(SubclassedModel)
 
   def _new_layer_weight_loading_test_template(
-      self, first_model_fn, second_model_fn, restore_init_fn, by_name):
+      self, first_model_fn, second_model_fn, restore_init_fn):
     with self.test_session() as session:
       model = first_model_fn()
       temp_dir = self.get_temp_dir()
@@ -602,12 +594,12 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
       self.addCleanup(shutil.rmtree, temp_dir)
 
       second_model = second_model_fn()
-      second_model.load_weights(prefix, by_name=by_name)
+      second_model.load_weights(prefix)
       second_model(x)
       self.evaluate(restore_init_fn(second_model))
       second_model.save_weights(prefix)
       # Check that the second model's checkpoint loads into the original model
-      model.load_weights(prefix, by_name=by_name)
+      model.load_weights(prefix)
       y = self.evaluate(model(x))
       self.assertAllClose(ref_y, y)
 
@@ -627,12 +619,9 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
     def _restore_init_fn(restore_model):
       return [v.initializer for v in restore_model.layers[-1].variables]
 
-    if h5py is None:
-      self.skipTest('This test only works with h5py.')
-
     self._new_layer_weight_loading_test_template(
         _save_graph_model, _restore_graph_model,
-        _restore_init_fn, by_name=True)
+        _restore_init_fn)
 
   @test_util.run_in_graph_and_eager_modes()
   def test_weight_loading_graph_model_added_no_weight_layer(self):
@@ -650,12 +639,10 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
     def _restore_init_fn(restore_model):
       del restore_model  # unused
       return []
-    if h5py is None:
-      self.skipTest('This test only works with h5py.')
 
     self._new_layer_weight_loading_test_template(
         _save_graph_model, _restore_graph_model,
-        _restore_init_fn, by_name=False)
+        _restore_init_fn)
 
   @test_util.run_in_graph_and_eager_modes()
   def test_weight_loading_subclassed_model_added_layer(self):
@@ -676,7 +663,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
 
     self._new_layer_weight_loading_test_template(
         SubclassedModel, SubclassedModelRestore,
-        _restore_init_fn, by_name=False)
+        _restore_init_fn)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential.py b/tensorflow/python/keras/_impl/keras/engine/sequential.py
index bd13ca6713..8626626ca1 100644
--- a/tensorflow/python/keras/_impl/keras/engine/sequential.py
+++ b/tensorflow/python/keras/_impl/keras/engine/sequential.py
@@ -29,7 +29,6 @@ from tensorflow.python.keras._impl.keras.engine.input_layer import Input
 from tensorflow.python.keras._impl.keras.engine.input_layer import InputLayer
 from tensorflow.python.keras._impl.keras.engine.training import Model
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import checkpointable
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -193,36 +192,6 @@ class Sequential(Model):
       self.build()
     else:
       self._layers.append(layer)
-    # In implementing Checkpointable, Sequential does not track its Layers
-    # normally, since they may be added and removed (in pop()). Instead, it
-    # names everything on demand (gathering dependencies in
-    # _checkpoint_dependencies, and looking them up in
-    # _lookup_dependency). _handle_deferred_dependencies just checks whether an
-    # existing checkpoint load targets this Layer, it does not create a
-    # dependency on the Layer.
-    self._handle_deferred_dependencies(
-        name='layer-%d' % (len(self._layers) - 1), checkpointable=layer)
-
-  @property
-  def _checkpoint_dependencies(self):
-    """For implementing Checkpointable. Layers which should be saved."""
-    return super(Sequential, self)._checkpoint_dependencies + [
-        checkpointable.CheckpointableReference(
-            name='layer-%d' % layer_index, ref=layer)
-        for layer_index, layer in enumerate(self._layers)]
-
-  def _lookup_dependency(self, name):
-    """For implementing Checkpointable. Looks up a Layer."""
-    super_lookup = super(Sequential, self)._lookup_dependency(name=name)
-    if super_lookup is not None:
-      return super_lookup
-    if name.startswith('layer-'):
-      try:
-        return self._layers[int(name[6:])]
-      except IndexError:
-        return None
-    else:
-      return None
 
   def pop(self):
     """Removes the last layer in the model.
@@ -257,6 +226,7 @@ class Sequential(Model):
     if self.inputs:
       self._init_graph_network(self.inputs, self.outputs, name=self.name)
       self.built = True
+    self._track_layers(self._layers)
 
   def predict_proba(self, x, batch_size=32, verbose=0):
     """Generates class probability predictions for the input samples.
diff --git a/tensorflow/python/training/checkpointable.py b/tensorflow/python/training/checkpointable.py
index 0b8473742c..05afd37ccd 100644
--- a/tensorflow/python/training/checkpointable.py
+++ b/tensorflow/python/training/checkpointable.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
@@ -119,6 +120,7 @@ class _CheckpointPosition(object):
       AssertionError: If another object is already bound to the `Object` proto.
     """
     checkpoint = self.checkpoint
+    checkpoint.all_python_objects.add(checkpointable)
     current_assignment = checkpoint.object_by_proto_id.get(self._proto_id, None)
     if current_assignment is None:
       checkpoint.object_by_proto_id[self._proto_id] = checkpointable
@@ -157,12 +159,12 @@ class _CheckpointPosition(object):
       # consistent (if the dependency DAG is not a tree then there are
       # multiple paths to the same object).
       if current_assignment is not checkpointable:
-        raise AssertionError(
-            ("Unable to load the checkpoint into this object graph. Either "
-             "the Checkpointable object references in the Python program "
-             "have changed in an incompatible way, or the checkpoint was "
-             "generated in an incompatible program.\n\nTwo checkpoint "
-             "references resolved to different objects (%s and %s).")
+        logging.warning(
+            ("Inconsistent references when loading the checkpoint into this "
+             "object graph. Either the Checkpointable object references in the "
+             "Python program have changed in an incompatible way, or the "
+             "checkpoint was generated in an incompatible program.\n\nTwo "
+             "checkpoint references resolved to different objects (%s and %s).")
             % (current_assignment, checkpointable))
       return False  # Not a new assignment
 
diff --git a/tensorflow/python/training/checkpointable_utils.py b/tensorflow/python/training/checkpointable_utils.py
index 13bd89d907..2a97b50fe7 100644
--- a/tensorflow/python/training/checkpointable_utils.py
+++ b/tensorflow/python/training/checkpointable_utils.py
@@ -84,6 +84,11 @@ class _CheckpointRestoreCoordinator(object):
     # (as objects with deferred dependencies will generally have references to
     # this object).
     self.object_by_proto_id = weakref.WeakValueDictionary()
+    # A set of all Python objects we've seen as dependencies, even if we didn't
+    # use them (for example because of inconsistent references when
+    # loading). Used to make status assertions fail when loading checkpoints
+    # that don't quite match.
+    self.all_python_objects = weakref.WeakSet()
     self.save_path = save_path
     self.dtype_map = dtype_map
     # When graph building, contains a list of ops to run to restore objects from
@@ -446,6 +451,14 @@ class CheckpointLoadStatus(_LoadStatus):
           ("Unused attributes in these objects (the attributes exist in the "
            "checkpoint but not in the objects): %s") % (
                self._checkpoint.unused_attributes.items(),))
+    unused_python_objects = (
+        set(self._checkpoint.all_python_objects)
+        - set(self._checkpoint.object_by_proto_id.values()))
+    if unused_python_objects:
+      raise AssertionError(
+          ("Some Python objects were not bound to checkpointed values, likely "
+           "due to changes in the Python program: %s")
+          % (unused_python_objects,))
     return self
 
   def run_restore_ops(self, session=None):
diff --git a/tensorflow/python/training/checkpointable_utils_test.py b/tensorflow/python/training/checkpointable_utils_test.py
index 29fcdb70b4..58e4b3cea5 100644
--- a/tensorflow/python/training/checkpointable_utils_test.py
+++ b/tensorflow/python/training/checkpointable_utils_test.py
@@ -808,13 +808,16 @@ class CheckpointingTests(test.TestCase):
     save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
         os.path.join(checkpoint_directory, "ckpt"))
     load_root = checkpointable.Checkpointable()
-    checkpointable_utils.CheckpointableSaver(load_root).restore(save_path)
+    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
+        save_path)
     load_root.dep_one = checkpointable.Checkpointable()
     load_root.dep_two = checkpointable.Checkpointable()
     load_root.dep_one.dep_three = checkpointable.Checkpointable()
-    with self.assertRaisesRegexp(AssertionError,
-                                 "resolved to different objects"):
-      load_root.dep_two.dep_three = checkpointable.Checkpointable()
+    load_root.dep_two.dep_three = checkpointable.Checkpointable()
+    checkpointable_utils.add_variable(
+        load_root.dep_one.dep_three, name="var", initializer=0.)
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
 
   @test_util.run_in_graph_and_eager_modes()
   def testObjectsCombined(self):
-- 
GitLab


From 285c6ca19f1ddb6ae26dcfd1e5c92018bf410f7a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 12:34:06 -0700
Subject: [PATCH 424/450] [XLA] Make the graph seed random when the HLO
 module seed is 0.

PiperOrigin-RevId: 194281233
---
 .../xla/service/elemental_ir_emitter.cc        | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 56e35e2604..38b5efa9fb 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -52,6 +52,13 @@ using tensorflow::strings::StrCat;
 
 namespace {
 
+int64 GlobalRandomValue() {
+  static auto* mu = new tensorflow::mutex();
+  static std::mt19937_64 rng{42};
+  tensorflow::mutex_lock l(*mu);
+  return rng();
+}
+
 llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
                                       int64 mantissa_bits,
                                       llvm::IRBuilder<>* ir_builder) {
@@ -1175,7 +1182,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator(
   llvm::Value* increment = ir_builder_->getInt(
       llvm::APInt(128, {0x14057B7EF767814F, 0x5851F42D4C957F2D}));
 
-  auto random_value = [hlo]() {
+  auto random_value_from_hlo = [hlo]() {
     const HloModule* module =
         hlo->IsFused() ? hlo->parent()->FusionInstruction()->parent()->parent()
                        : hlo->parent()->parent();
@@ -1197,10 +1204,15 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator(
       /*Ty=*/ir_builder_->getInt64Ty(),
       /*isConstant=*/false,
       /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
-      /*Initializer=*/ir_builder_->getInt64(random_value()),
+      /*Initializer=*/ir_builder_->getInt64(random_value_from_hlo()),
       /*Name=*/"state_ptr0");
+
+  // When the module config seed is 0, the expected result of a prng is a random
+  // value. Instead of using the random_value_from_hlo, we need a global random
+  // value as the graph seed. This is because if we use random_value_from_hlo
+  // here, then for a newly built hlo graph, it always gives the same number.
   uint64 graph_seed = hlo_module_config_.seed() != 0 ? hlo_module_config_.seed()
-                                                     : random_value();
+                                                     : GlobalRandomValue();
   llvm::GlobalVariable* state_ptr1 = new llvm::GlobalVariable(
       /*M=*/*module_,
       /*Ty=*/ir_builder_->getInt64Ty(),
-- 
GitLab


From 11eadddc01e677cb8591265c7d6273f155fef48f Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 25 Apr 2018 12:36:15 -0700
Subject: [PATCH 425/450] Disable tests for the parallel CPU backend

PiperOrigin-RevId: 194281504
---
 tensorflow/compiler/xla/tests/BUILD           |  4 ----
 tensorflow/compiler/xla/tests/build_defs.bzl  | 24 ++++++++-----------
 tensorflow/compiler/xla/tests/client_test.cc  |  3 +--
 .../compiler/xla/tests/dynamic_ops_test.cc    | 23 ++++--------------
 .../xla/tests/execution_profile_test.cc       |  3 +--
 .../xla/tests/gather_operation_test.cc        |  5 +---
 .../xla/tests/local_client_execute_test.cc    |  7 ++----
 tensorflow/compiler/xla/tests/prng_test.cc    |  6 ++---
 .../compiler/xla/tests/reduce_window_test.cc  |  5 ++--
 tensorflow/compiler/xla/tests/test_macros.h   |  8 -------
 tensorflow/compiler/xla/tests/tuple_test.cc   | 13 ++++------
 tensorflow/compiler/xla/tests/while_test.cc   |  4 ----
 .../xla/tests/xla_hlo_profile_test.cc         |  8 ++-----
 13 files changed, 30 insertions(+), 83 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 25bbde1677..ca8b3f9ffc 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -731,9 +731,6 @@ xla_test(
         "cpu": [
             "--xla_cpu_multi_thread_eigen=false",
         ],
-        "cpu_parallel": [
-            "--xla_cpu_multi_thread_eigen=false",
-        ],
     },
     shard_count = 20,
     tags = ["optonly"],
@@ -836,7 +833,6 @@ xla_test(
     backend_tags = {
         # TODO(b/31436974): Fix msan failure. Failed on 2016-09-12.
         "cpu": ["nomsan"],
-        "cpu_parallel": ["nomsan"],
     },
     shard_count = 30,
     deps = [
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index eac2eb286c..53f2c3bfbf 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -4,7 +4,7 @@ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured")
 load("//tensorflow/compiler/xla/tests:plugin.bzl", "plugins")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-all_backends = ["cpu", "cpu_parallel", "gpu"] + plugins.keys()
+all_backends = ["cpu", "gpu"] + plugins.keys()
 
 def filter_backends(backends):
   """Removes "gpu" from a backend list if CUDA is not enabled.
@@ -39,10 +39,10 @@ def xla_test(name,
              **kwargs):
   """Generates cc_test targets for the given XLA backends.
 
-  This rule generates a cc_test target for one or more XLA backends and also
-  a platform-agnostic cc_library rule. The arguments are identical to cc_test
-  with two additions: 'backends' and 'backend_args'. 'backends' specifies the
-  backends to generate tests for ("cpu", "cpu_parallel", "gpu"), and
+  This rule generates a cc_test target for one or more XLA backends and also a
+  platform-agnostic cc_library rule. The arguments are identical to cc_test with
+  two additions: 'backends' and 'backend_args'. 'backends' specifies the
+  backends to generate tests for ("cpu", "gpu"), and
   'backend_args'/'backend_tags' specifies backend-specific args parameters to
   use when generating the cc_test.
 
@@ -90,9 +90,9 @@ def xla_test(name,
     deps: Dependencies of the target.
     xla_test_library_deps: If set, the generated test targets will depend on the
       respective cc_libraries generated by the xla_test_library rule.
-    backends: A list of backends to generate tests for. Supported
-      values: "cpu", "cpu_parallel", "gpu". If this list is empty, the test will
-      be generated for all supported backends.
+    backends: A list of backends to generate tests for. Supported values: "cpu",
+      "gpu". If this list is empty, the test will be generated for all supported
+      backends.
     blacklisted_backends: A list of backends to NOT generate tests for.
     args: Test arguments for the target.
     tags: Tags for the target.
@@ -128,10 +128,6 @@ def xla_test(name,
     if backend == "cpu":
       backend_deps = ["//tensorflow/compiler/xla/service:cpu_plugin"]
       backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_cpu"]
-    elif backend == "cpu_parallel":
-      backend_deps = ["//tensorflow/compiler/xla/service:cpu_plugin"]
-      backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_cpu"]
-      this_backend_args += ["--xla_backend_extra_options=\"xla_cpu_parallel\""]
     elif backend == "gpu":
       backend_deps = ["//tensorflow/compiler/xla/service:gpu_plugin"]
       backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_gpu"]
@@ -201,7 +197,7 @@ def xla_test_library(name,
     hdrs: Headers for the target.
     deps: Dependencies of the target.
     backends: A list of backends to generate libraries for.
-      Supported values: "cpu", "cpu_parallel", "gpu". If this list is empty, the
+      Supported values: "cpu", "gpu". If this list is empty, the
       library will be generated for all supported backends.
   """
 
@@ -210,7 +206,7 @@ def xla_test_library(name,
 
   for backend in filter_backends(backends):
     this_backend_copts = []
-    if backend in ["cpu", "cpu_parallel", "gpu"]:
+    if backend in ["cpu", "gpu"]:
       backend_deps = ["//tensorflow/compiler/xla/tests:test_macros_%s" % backend]
     elif backend in plugins:
       backend_deps = plugins[backend]["deps"]
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 32e2f2c084..1e54471796 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -109,8 +109,7 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
                                      /*minor_to_major=*/{1, 0})));
 }
 
-XLA_TEST_F(ClientTest,
-        DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(ExecuteParallel))) {
+XLA_TEST_F(ClientTest, DISABLED_ON_GPU(ExecuteParallel)) {
   XlaComputation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
   Shape shape = ShapeUtil::MakeShape(S32, {2, 2});
 
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 021fbcedb9..ff53a84588 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -470,13 +470,6 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
   template <class T>
   void RunR3Contiguous(std::vector<int32> operand_shape, int32 index,
                        int32 size) {
-#ifdef XLA_TEST_BACKEND_CPU_PARALLEL
-    // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
-    if (std::is_same<bfloat16, T>::value) {
-      return;
-    }
-#endif
-
     const int32 kSeq = operand_shape[0];
     const int32 kBatch = operand_shape[1];
     const int32 kDim = operand_shape[2];
@@ -539,30 +532,22 @@ XLA_TEST_F(DynamicUpdateSliceTest, Int64R0) { TestR0<int64, float>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64R0) { TestR0<uint64, float>(); }
 
 // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
-XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R1BF16)) {
-  TestR1<int32, bfloat16>();
-}
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R1BF16) { TestR1<int32, bfloat16>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1<int32, float>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1<int64, float>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1<uint64, float>(); }
 
-// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
-XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R2BF16)) {
-  TestR2<int32, bfloat16>();
-}
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R2BF16) { TestR2<int32, bfloat16>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2<int32, float>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2<int64, int64>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2<uint64, int32>(); }
 
-// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
-XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R3BF16)) {
-  TestR3<int32, bfloat16>();
-}
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R3BF16) { TestR3<int32, bfloat16>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3<int32, float>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3<int64, int64>(); }
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3<uint64, uint64>(); }
 
-XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32WrapBF16)) {
+XLA_TEST_F(DynamicUpdateSliceTest, Int32WrapBF16) {
   TestWrap<int32, bfloat16>();
 }
 XLA_TEST_F(DynamicUpdateSliceTest, Int32Wrap) { TestWrap<int32, float>(); }
diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc
index 644cbbf40f..c8cc8e40aa 100644
--- a/tensorflow/compiler/xla/tests/execution_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc
@@ -24,8 +24,7 @@ namespace {
 
 class ExecutionProfileTest : public ClientLibraryTestBase {};
 
-XLA_TEST_F(ExecutionProfileTest,
-           DISABLED_ON_CPU_PARALLEL(ExecuteWithExecutionProfile)) {
+XLA_TEST_F(ExecutionProfileTest, ExecuteWithExecutionProfile) {
   Shape shape = ShapeUtil::MakeShape(F32, {256, 256});
 
   TF_ASSERT_OK_AND_ASSIGN(
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index 90496d55e6..4dd3acd9af 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -401,10 +401,7 @@ ENTRY main {
 
 class GatherClientLibraryTest : public ClientLibraryTestBase {};
 
-// TODO(b/30671675): Asynchronous execution on stream is not yet supported on
-// GPU and CPU_PARALLEL.
-XLA_TEST_F(GatherClientLibraryTest,
-           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(Basic))) {
+XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) {
   // We create this HLO, but using the XlaBuilder API.
   //
   // ENTRY main {
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 7e14e77366..26c33f7725 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -453,9 +453,7 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
   }
 }
 
-// TODO(b/66968986): Test times out on CPU parallel backend. Disabled
-// 2017-09-26.
-XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) {
+XLA_TEST_F(LocalClientExecuteTest, LargeNestedTuple) {
   // Construct and run a computation which takes a two-level nested tuple
   // parameter with a large fanout.
   const int kFanout = 40;
@@ -853,8 +851,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) {
 
 // TODO(b/34359662): Support infeed/outfeed on GPU and CPU parallel.
 // 2017-10-18.
-XLA_TEST_F(LocalClientExecuteTest,
-           DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(InfeedOutfeedTest))) {
+XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_GPU(InfeedOutfeedTest)) {
   ComputationBuilder builder(local_client_, TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {3});
   auto in = builder.Infeed(shape);
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 6aafb9fa6c..733d89fdcc 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -81,8 +81,7 @@ XLA_TEST_F(PrngTest, LargeU01) { UniformTest<float>(0, 1, {0x100, 0x100}); }
 XLA_TEST_F(PrngTest, TwelveValuesU524) { UniformTest<int32>(5, 24, {12}); }
 
 // TODO(b/71543667): Fix Rng ops on LLVM backends.
-XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(
-                         DISABLED_ON_CPU(ScalarBF16Tests)))) {
+XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests))) {
   for (int64 seed = 0; seed < 100; ++seed) {
     // The largest negative number smaller than zero in bf16 that's not
     // denormalized.
@@ -105,8 +104,7 @@ XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(
 }
 
 // TODO(b/71543667): Fix Rng ops on LLVM backends.
-XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(
-                         DISABLED_ON_CPU_PARALLEL(ScalarBF16CountTests)))) {
+XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16CountTests))) {
   // There are 3 BF16 values in the range of [32.25, 33): 32.25, 32.5, 32.75,
   // they should get similar counts.
   bfloat16 low = static_cast<bfloat16>(32.25);
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 0a09766722..10a3da3a38 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -861,8 +861,7 @@ INSTANTIATE_TEST_CASE_P(
 class R4ReduceWindowAnyDimsTest : public R4ReduceWindowTest {};
 
 // TODO(b/72234705): Fix the test cases failed on CPU and GPU.
-XLA_TEST_P(R4ReduceWindowAnyDimsTest,
-           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) {
+XLA_TEST_P(R4ReduceWindowAnyDimsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt))) {
   DoIt();
 }
 
@@ -1151,7 +1150,7 @@ class R2ReduceWindowFailingCpuGpuBf16Test : public R2ReduceWindowTest {};
 
 // TODO(b/72234705): Fix the test cases failed on CPU and GPU.
 XLA_TEST_P(R2ReduceWindowFailingCpuGpuBf16Test,
-           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) {
+           DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt))) {
   DoIt();
 }
 
diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h
index e2d406f66d..7ca99a9163 100644
--- a/tensorflow/compiler/xla/tests/test_macros.h
+++ b/tensorflow/compiler/xla/tests/test_macros.h
@@ -34,7 +34,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 #define DISABLED_ON_CPU(X) X
-#define DISABLED_ON_CPU_PARALLEL(X) X
 #define DISABLED_ON_GPU(X) X
 #define DISABLED_ON_INTERPRETER(X) X
 
@@ -51,13 +50,6 @@ limitations under the License.
 # define DISABLED_ON_CPU(X) XLA_TEST_PASTE(DISABLED_, X)
 #endif  // XLA_TEST_BACKEND_CPU
 
-#ifdef XLA_TEST_BACKEND_CPU_PARALLEL
-# undef DISABLED_ON_CPU
-# define DISABLED_ON_CPU(X) XLA_TEST_PASTE(DISABLED_, X)
-# undef DISABLED_ON_CPU_PARALLEL
-# define DISABLED_ON_CPU_PARALLEL(X) XLA_TEST_PASTE(DISABLED_, X)
-#endif  // XLA_TEST_BACKEND_CPU_PARALLEL
-
 #ifdef XLA_TEST_BACKEND_GPU
 # undef DISABLED_ON_GPU
 # define DISABLED_ON_GPU(X) XLA_TEST_PASTE(DISABLED_, X)
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 61d0fa02ab..61be174653 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -269,7 +269,7 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) {
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
 
-XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnFalse)) {
+XLA_TEST_F(TupleTest, SelectBetweenTuplesOnFalse) {
   // Tests a selection between tuples with "false" path taken.
   XlaBuilder builder(TestName());
 
@@ -313,7 +313,7 @@ XLA_TEST_F(TupleTest, TuplesInAMap) {
   ComputeAndCompareR1<float>(&b, {-99.0f, 101.0f, 214.41f}, {}, error_spec_);
 }
 
-XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) {
+XLA_TEST_F(TupleTest, SelectBetweenTuplesOnTrue) {
   // Tests a selection between tuples with "true" path taken.
   XlaBuilder builder(TestName());
 
@@ -350,7 +350,7 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesElementResult) {
 }
 
 // Cascaded selects between tuple types.
-XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) {
+XLA_TEST_F(TupleTest, SelectBetweenTuplesCascaded) {
   //
   //                       vec1     vec2   vec2     vec1
   //                        |        |      |        |
@@ -390,8 +390,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) {
   ComputeAndCompareR1<float>(&builder, {3.f, 6.f, 9.f}, {}, error_spec_);
 }
 
-XLA_TEST_F(TupleTest,
-           DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesReuseConstants)) {
+XLA_TEST_F(TupleTest, SelectBetweenTuplesReuseConstants) {
   // Similar to SelectBetweenTuples, but the constants are shared between the
   // input tuples.
   XlaBuilder builder(TestName());
@@ -516,10 +515,8 @@ XLA_TEST_F(TupleTest, ComplexTuples) {
 
 class TupleHloTest : public HloTestBase {};
 
-// Disabled on CPU parallel because that's broken and will be removed soon.
 // Disabled on the interpreter because bitcast doesn't exist on the interpreter.
-TEST_F(TupleHloTest,
-       DISABLED_ON_INTERPRETER(DISABLED_ON_CPU_PARALLEL(BitcastAfterGTE))) {
+TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
   const char* testcase = R"(
     HloModule m
 
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 1e18b56799..336fed27c6 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -1321,10 +1321,6 @@ void BM_WhileLoop(int num_iters) {
   }
 }
 
-// TODO(b/32470510): Benchmark fails on parallel CPU backend.
-#ifndef XLA_TEST_BACKEND_CPU_PARALLEL
 BENCHMARK(BM_WhileLoop);
-#endif
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 837a01e873..8354bb71cb 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -175,8 +175,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   XLA_VLOG_LINES(4, *profile_output);
 }
 
-// TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
-XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
+XLA_TEST_F(HloProfileTest, ProfileSingleComputation) {
   const int64 m = 256, k = 256, n = 256;
   Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k});
@@ -239,12 +238,9 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
   EXPECT_TRUE(HasTrops(tanh_profile));
 }
 
-// TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
-//
 // TODO(b/71544591): The GPU backend does not record cycles spent in on Hlo
 // instructions "interior" to while nodes.
-XLA_TEST_F(HloProfileTest,
-           DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(ProfileWhileComputation))) {
+XLA_TEST_F(HloProfileTest, DISABLED_ON_GPU(ProfileWhileComputation)) {
   const int64 size = 256;
   Shape matrix_shape = ShapeUtil::MakeShape(F32, {size, size});
   Shape while_result_shape =
-- 
GitLab


From e0c8c11e03f33e81c044dbe6499ef548d9966f22 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 12:55:21 -0700
Subject: [PATCH 426/450] Adding info to FakeQuant ops in graphviz.

PiperOrigin-RevId: 194283908
---
 tensorflow/contrib/lite/toco/dump_graphviz.cc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc
index c289ddcd92..5bb0e3ba4d 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.cc
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc
@@ -259,6 +259,19 @@ NodeProperties GetPropertiesForOperator(const Operator& op) {
       node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
       break;
     }
+    case OperatorType::kFakeQuant: {
+      const auto& fakequant_op = static_cast<const FakeQuantOperator&>(op);
+      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
+      if (fakequant_op.minmax) {
+        AppendF(&node_properties.label, "\\n%dbit [%g,%g]",
+                fakequant_op.num_bits, fakequant_op.minmax->min,
+                fakequant_op.minmax->max);
+      } else {
+        AppendF(&node_properties.label, "\\n%dbit [?,?]",
+                fakequant_op.num_bits);
+      }
+      break;
+    }
     default:
       node_properties.color = Color(0xDB, 0x44, 0x37);
       break;
-- 
GitLab


From 4bb74d42a4e8d0e9694ac88396b3b9b548ccd9de Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Wed, 25 Apr 2018 13:27:30 -0700
Subject: [PATCH 427/450] Remove StreamExecutorUtil::ConvertStatus.

This function is a nop; StreamExecutor's Status is the same as
TensorFlow's Status.

PiperOrigin-RevId: 194288432
---
 tensorflow/compiler/jit/kernels/xla_launch_op.cc |  2 +-
 tensorflow/compiler/jit/xla_device.cc            |  2 +-
 tensorflow/core/common_runtime/gpu/gpu_device.cc |  2 +-
 tensorflow/core/common_runtime/gpu/gpu_init.cc   |  7 +------
 tensorflow/core/util/stream_executor_util.h      | 11 -----------
 5 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 03ae09ee8b..049d170fa4 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -69,7 +69,7 @@ Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx,
 
   auto platform = se::MultiPlatformManager::PlatformWithId(platform_id_);
   if (!platform.ok()) {
-    return StreamExecutorUtil::ConvertStatus(platform.status());
+    return platform.status();
   }
   xla::LocalClientOptions client_options;
   client_options.set_platform(platform.ValueOrDie());
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 3e27cd39c6..c814b7eb02 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -120,7 +120,7 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
 
   auto platform = se::MultiPlatformManager::PlatformWithName(platform_name);
   if (!platform.ok()) {
-    return StreamExecutorUtil::ConvertStatus(platform.status());
+    return platform.status();
   }
 
   const DeviceAttributes attrs = Device::BuildDeviceAttributes(
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 4abec7c3d5..1fa33991f7 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -1333,7 +1333,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
 
     auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id);
     if (!executor.ok()) {
-      return StreamExecutorUtil::ConvertStatus(executor.status());
+      return executor.status();
     }
 
     auto stream_exec = executor.ValueOrDie();
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.cc b/tensorflow/core/common_runtime/gpu/gpu_init.cc
index ff96891a2a..e0ec93a98e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.cc
@@ -29,12 +29,7 @@ limitations under the License.
 namespace tensorflow {
 
 Status ValidateGPUMachineManager() {
-  auto result = se::MultiPlatformManager::PlatformWithName("CUDA");
-  if (!result.ok()) {
-    return StreamExecutorUtil::ConvertStatus(result.status());
-  }
-
-  return Status::OK();
+  return se::MultiPlatformManager::PlatformWithName("CUDA").status();
 }
 
 se::Platform* GPUMachineManager() {
diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h
index 7d715de499..4787bcf6de 100644
--- a/tensorflow/core/util/stream_executor_util.h
+++ b/tensorflow/core/util/stream_executor_util.h
@@ -34,17 +34,6 @@ class StreamExecutorUtil {
     T* ptr = reinterpret_cast<T*>(const_cast<char*>(t.tensor_data().data()));
     return se::DeviceMemory<T>(se::DeviceMemoryBase(ptr, t.TotalBytes()));
   }
-
-  // Converts from a StreamExecutor Status to a TensorFlow Status.
-  //
-  // This assumes that the error codes between the two implementations
-  // match.
-  static Status ConvertStatus(const se::port::Status& s) {
-    return s.ok() ? Status::OK()
-                  : Status(static_cast<tensorflow::error::Code>(
-                               static_cast<int>(s.code())),
-                           s.error_message());
-  }
 };
 
 }  // namespace tensorflow
-- 
GitLab


From 4e0bf0c07ac80f205fa87c643890d5f1ef6cde2b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 13:55:23 -0700
Subject: [PATCH 428/450] Use a built-in function shape_n instead of running
 many array_ops.shape and stacking the results.

PiperOrigin-RevId: 194292637
---
 .../boosted_trees/python/training/functions/gbdt_batch.py   | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 4bde7f3e33..08c1dcdd02 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -970,10 +970,8 @@ class GradientBoostedDecisionTreeModel(object):
       # Stack all the inputs to one tensor per type.
       # This is a workaround for the slowness of graph building in tf.cond.
       # See (b/36554864).
-      split_sizes = array_ops.stack([
-          array_ops.shape(partition_id)[0]
-          for partition_id in partition_ids_list
-      ])
+      split_sizes = array_ops.reshape(
+          array_ops.shape_n(partition_ids_list), [-1])
       partition_ids = array_ops.concat(partition_ids_list, axis=0)
       gains = array_ops.concat(gains_list, axis=0)
       split_infos = array_ops.concat(split_info_list, axis=0)
-- 
GitLab


From 536299d7e8af0988a59a5ab179b4c1de5a32d137 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 13:58:54 -0700
Subject: [PATCH 429/450] Automated g4 rollback of changelist 193731341

PiperOrigin-RevId: 194293187
---
 tensorflow/compiler/xla/service/BUILD         |   3 -
 .../xla/service/computation_layout.cc         |   7 +-
 .../compiler/xla/service/computation_layout.h |   5 +-
 .../compiler/xla/service/hlo_instruction.h    |   8 -
 .../compiler/xla/service/layout_assignment.cc | 328 +++++-------------
 .../compiler/xla/service/layout_assignment.h  |  65 +---
 tensorflow/compiler/xla/service/service.cc    |   5 +-
 .../compiler/xla/service/tuple_simplifier.cc  |  25 +-
 8 files changed, 121 insertions(+), 325 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 5edb9440c0..d55da3686c 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1955,12 +1955,10 @@ cc_library(
     deps = [
         ":computation_layout",
         ":hlo",
-        ":hlo_dce",
         ":hlo_graph_dumper",
         ":hlo_pass",
         ":logical_buffer",
         ":tuple_points_to_analysis",
-        ":tuple_simplifier",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -2437,7 +2435,6 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc
index cb61f3da39..d2d4f14fce 100644
--- a/tensorflow/compiler/xla/service/computation_layout.cc
+++ b/tensorflow/compiler/xla/service/computation_layout.cc
@@ -23,15 +23,12 @@ limitations under the License.
 
 namespace xla {
 
-ComputationLayout::ComputationLayout(const ProgramShape& program_shape,
-                                     bool ignore_layouts)
+ComputationLayout::ComputationLayout(const ProgramShape& program_shape)
     : result_layout_(program_shape.result()) {
   for (auto& shape : program_shape.parameters()) {
     parameter_layouts_.emplace_back(shape);
   }
-  if (ignore_layouts) {
-    SetToDefaultLayout();
-  }
+  SetToDefaultLayout();
 }
 
 void ComputationLayout::SetToDefaultLayout() {
diff --git a/tensorflow/compiler/xla/service/computation_layout.h b/tensorflow/compiler/xla/service/computation_layout.h
index 53c3a3f7b7..80e102411c 100644
--- a/tensorflow/compiler/xla/service/computation_layout.h
+++ b/tensorflow/compiler/xla/service/computation_layout.h
@@ -34,9 +34,8 @@ class ComputationLayout {
  public:
   // Constructs a ComputationLayout from a ProgramShape. The layouts of the
   // parameters and results are set to the default layout. Layouts in the
-  // ProgramShape are ignored if ignore_layouts is true.
-  explicit ComputationLayout(const ProgramShape& program_shape,
-                             bool ignore_layouts = true);
+  // ProgramShape are ignored.
+  explicit ComputationLayout(const ProgramShape& program_shape);
 
   // Returns the layout of a particular parameter.
   const ShapeLayout& parameter_layout(int64 param_no) const {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index f3da3fc256..a5e9aecb9e 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -956,14 +956,6 @@ class HloInstruction {
   void clear_sharding() { sharding_ = nullptr; }
   // Return true if this operator has a sharding assigned.
   bool has_sharding() const { return sharding_ != nullptr; }
-  // Checks whether the instruction has compatible sharding with the other
-  // instruction.
-  bool has_compatible_sharding(const HloInstruction* other) const {
-    if (!has_sharding()) {
-      return !other->has_sharding();
-    }
-    return other->has_sharding() ? sharding() == other->sharding() : false;
-  }
 
   // When creating a new instruction which either replaces, or shifts up (kCopy
   // insertion case), another instruction, we need to make sure the certain
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 7067b6f86a..2494569db5 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -31,12 +31,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -402,9 +400,9 @@ string LayoutConstraints::ToString() const {
 }
 
 Status LayoutAssignment::AddMandatoryConstraints(
-    const ComputationLayout* computation_layout,
-    ChannelLayoutConstraints* channel_constraints, HloComputation* computation,
-    LayoutConstraints* constraints) {
+    const ComputationLayout& computation_layout,
+    const ChannelLayoutConstraints* channel_constraints,
+    HloComputation* computation, LayoutConstraints* constraints) {
   VLOG(3) << "Adding mandatory layout constraints to computation "
           << computation->name();
 
@@ -426,16 +424,11 @@ Status LayoutAssignment::AddMandatoryConstraints(
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
           instruction->outfeed_shape(), instruction, 0));
     } else if (instruction->opcode() == HloOpcode::kParameter) {
-      if (computation_layout != nullptr) {
-        const ShapeLayout& parameter_layout =
-            computation_layout->parameter_layout(
-                instruction->parameter_number());
-        if (parameter_layout.LayoutIsSet()) {
-          // Parameter layouts must match the respective layout in
-          // ComputationLayout, if there is one.
-          shape_with_layout = &parameter_layout.shape();
-        }
-      }
+      // Parameter layouts must match the respective layout in
+      // ComputationLayout.
+      shape_with_layout =
+          &computation_layout.parameter_layout(instruction->parameter_number())
+               .shape();
     }
     if (shape_with_layout != nullptr) {
       TF_RETURN_IF_ERROR(
@@ -500,8 +493,9 @@ Status LayoutAssignment::AddMandatoryConstraints(
       HloComputation* body = instruction->while_body();
       HloComputation* condition = instruction->while_condition();
       const HloInstruction* init = instruction->operand(0);
-      ComputationLayout& body_layout = FindOrDie(computation_layouts_, body);
-      ComputationLayout& condition_layout =
+      const ComputationLayout& body_layout =
+          FindOrDie(computation_layouts_, body);
+      const ComputationLayout& condition_layout =
           FindOrDie(computation_layouts_, condition);
 
       // Check a few invariants irrespective of layout.
@@ -514,19 +508,26 @@ Status LayoutAssignment::AddMandatoryConstraints(
                                    condition_layout.parameter_shape(0)));
       DCHECK(ShapeUtil::Compatible(body_layout.result_shape(), init->shape()));
 
-      if (body_layout.result_layout() != body_layout.parameter_layout(0)) {
-        VLOG(2) << "Reset %while body parameter layout: body=" << body->name()
-                << " while=" << instruction->name()
-                << " shape=" << body_layout.result_layout().ToString();
-        *body_layout.mutable_parameter_layout(0) = body_layout.result_layout();
+      // Return error if earlier layout assignment of the embedded computations
+      // has produced conflicting layouts.
+      if (!ShapeUtil::Equal(body_layout.result_shape(),
+                            body_layout.parameter_shape(0))) {
+        return InternalError(
+            "Parameter and result of body computation %s of while instruction "
+            "%s have different layouts: %s vs %s",
+            body->name().c_str(), instruction->name().c_str(),
+            ShapeUtil::HumanString(body_layout.result_shape()).c_str(),
+            ShapeUtil::HumanString(body_layout.parameter_shape(0)).c_str());
       }
-      if (condition_layout.parameter_layout(0) !=
-          body_layout.parameter_layout(0)) {
-        VLOG(2) << "Reset %while condition parameter layout: cond="
-                << condition->name() << " while=" << instruction->name()
-                << " shape=" << body_layout.parameter_layout(0).ToString();
-        *condition_layout.mutable_parameter_layout(0) =
-            body_layout.parameter_layout(0);
+      if (!ShapeUtil::Equal(body->root_instruction()->shape(),
+                            condition->parameter_instruction(0)->shape())) {
+        return InternalError(
+            "Parameter of condition computation %s of while instruction "
+            "%s does not match body computation %s result: %s vs %s",
+            condition->name().c_str(), instruction->name().c_str(),
+            body->name().c_str(),
+            ShapeUtil::HumanString(condition_layout.parameter_shape(0)).c_str(),
+            ShapeUtil::HumanString(body_layout.result_shape()).c_str());
       }
 
       // Constrain the output and the operand of the while instruction to match
@@ -556,20 +557,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
                                    true_computation_layout.parameter_shape(0)));
       DCHECK(ShapeUtil::Compatible(
           false_operand->shape(), false_computation_layout.parameter_shape(0)));
-      if (true_computation_layout.result_layout() !=
-          false_computation_layout.result_layout()) {
-        // We assign layouts in DFS fashion, so the true and false computations
-        // might have negotiated a different layout. But for the conditional
-        // instruction POV the layout must match, so we run again on the false
-        // computation, this time with proper computation layout.
-        VLOG(2) << "Reset %conditional false computation result layout: "
-                   "false_computation="
-                << false_computation->name()
-                << " conditional=" << instruction->name() << " shape="
-                << true_computation_layout.result_layout().ToString();
-        *false_computation_layout.mutable_result_layout() =
-            true_computation_layout.result_layout();
-      }
+
       TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
           true_computation_layout.result_shape(), instruction));
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
@@ -605,14 +593,10 @@ Status LayoutAssignment::AddMandatoryConstraints(
       }
     }
   }
-  // Finally set the result layout to match ComputationLayout, if there is one.
-  if (computation_layout != nullptr) {
-    const ShapeLayout& result_layout = computation_layout->result_layout();
-    if (result_layout.LayoutIsSet()) {
-      TF_RETURN_IF_ERROR(constraints->SetResultLayout(result_layout.shape()));
-    }
-  }
-  return Status::OK();
+
+  // Finally set the result layout to match ComputationLayout.
+  return constraints->SetResultLayout(
+      computation_layout.result_layout().shape());
 }
 
 namespace {
@@ -776,7 +760,6 @@ StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
     HloInstruction* copy =
         instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
             instruction->shape(), HloOpcode::kCopy, instruction));
-    RegisterAddedCopy(copy);
     SetupCopiedInstruction(*instruction, copy, {});
     LayoutUtil::ClearLayout(copy->mutable_shape());
     TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
@@ -800,19 +783,13 @@ Status LayoutAssignment::CopyOperandIfLayoutsDiffer(
   TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape()));
 
   if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) {
-    VLOG(5) << "Operand " << operand->ToString() << " layout matches in "
-            << instruction->ToString();
     // Operand layout already matches our constraint. Nothing to do.
     return Status::OK();
   }
-  VLOG(4) << "Operand " << operand->ToString() << " layout does not match "
-          << operand_layout.ToString() << " in " << instruction->ToString();
 
   TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy,
                       CreateCopyWithNewLayout(operand_layout.shape(), operand));
 
-  VLOG(4) << "New copy of " << operand->ToString() << " is "
-          << operand_copy->ToString();
   return instruction->ReplaceOperandWith(operand_no, operand_copy);
 }
 
@@ -919,16 +896,15 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) {
       }
     }
   }
-  // Finally verify the result layout, if set, matches the layout of the entry
+
+  // Finally verify the result layout matches the layout of the entry
   // computation root.
-  const ShapeLayout& result_layout =
+  TF_RET_CHECK(ShapeUtil::Equal(
+      module->entry_computation()->root_instruction()->shape(),
       FindOrDie(computation_layouts_, module->entry_computation())
-          .result_layout();
-  if (result_layout.LayoutIsSet()) {
-    TF_RET_CHECK(ShapeUtil::Equal(
-        module->entry_computation()->root_instruction()->shape(),
-        result_layout.shape()));
-  }
+          .result_layout()
+          .shape()));
+
   return Status::OK();
 }
 
@@ -937,13 +913,18 @@ LayoutAssignment::LayoutAssignment(
     ChannelLayoutConstraints* channel_constraints)
     : entry_computation_layout_(entry_computation_layout),
       channel_layout_constraints_(channel_constraints) {
-  VLOG(1) << "Entry computation layout given to layout assignment: "
+  VLOG(1) << "entry computation layout given to layout assignment: "
           << entry_computation_layout_->ToString();
   // Layouts of all parameter instructions must be set.
   for (const ShapeLayout& parameter_layout :
        entry_computation_layout_->parameter_layouts()) {
     CHECK(parameter_layout.LayoutIsSet());
   }
+  // If the result layout is not set, then choose the default.
+  // TODO(b/29118294): Choose a better layout in this case.
+  if (!entry_computation_layout_->result_layout().LayoutIsSet()) {
+    entry_computation_layout_->mutable_result_layout()->SetToDefaultLayout();
+  }
 }
 
 std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
@@ -1503,60 +1484,16 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
   return Status::OK();
 }
 
-Status LayoutAssignment::CalculateComputationLayout(
-    HloComputation* computation) {
-  ComputationLayout computation_layout(computation->ComputeProgramShape(),
-                                       /*ignore_layouts=*/false);
-  InsertOrDie(&computation_layouts_, computation, computation_layout);
-  VLOG(2) << "  Calculated ComputationLayout = "
-          << computation_layout.ToString();
-  return Status::OK();
-}
-
-Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) {
-  // Clear existing layouts of the instructions.  All layouts must be assigned
-  // by the LayoutAssignment pass, except for those on infeeds, parameters,
-  // and the computation result. The latter two are specified in
-  // computation_layout, so we only need to keep the existing layouts for
-  // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
-  // layout assignment pass that may accidently use the existing layout.
-  for (HloInstruction* instruction : computation->instructions()) {
-    if (instruction->opcode() == HloOpcode::kBitcast) {
-      // bitcasts are inherently layout sensitive and so a bitcast instruction
-      // present in the IR before layout assignment is a bug.
-      return InternalError(
-          "Unexpected bitcast operation seen during layout assignment: %s.",
-          instruction->ToString().c_str());
-    }
-    if (instruction->opcode() != HloOpcode::kInfeed) {
-      LayoutUtil::ClearLayout(instruction->mutable_shape());
-    }
-  }
-  return Status::OK();
-}
-
 Status LayoutAssignment::RunOnComputation(
-    ComputationLayout* computation_layout,
+    const ComputationLayout& computation_layout,
     const TuplePointsToAnalysis& points_to_analysis,
     HloComputation* computation,
     ChannelLayoutConstraints* channel_constraints) {
+  DCHECK(computation_layout.LayoutIsSet());
+  InsertOrDie(&computation_layouts_, computation, computation_layout);
   VLOG(2) << "LayoutAssignment::RunOnComputation(" << computation->name()
           << ")";
-  TF_RETURN_IF_ERROR(ClearComputationLayouts(computation));
-  if (computation_layout != nullptr) {
-    auto it = computation_layouts_.find(computation);
-    if (it == computation_layouts_.end()) {
-      VLOG(2) << "  New ComputationLayout = " << computation_layout->ToString();
-      computation_layouts_.emplace(computation, *computation_layout);
-    } else {
-      TF_RET_CHECK(computation_layout == &it->second ||
-                   computation_layout == entry_computation_layout_);
-      VLOG(2) << "  Existing ComputationLayout = "
-              << computation_layout->ToString();
-    }
-  } else {
-    VLOG(2) << "  No ComputationLayout specified (will be calculated)";
-  }
+  VLOG(2) << "  ComputationLayout = " << computation_layout.ToString();
 
   // Construct LayoutConstraints with all layout constraints of the computation.
   LayoutConstraints constraints(points_to_analysis, computation);
@@ -1599,19 +1536,12 @@ Status LayoutAssignment::RunOnComputation(
     CHECK_LT(constraints.unconstrained_buffer_ids().size(),
              unconstrained_count);
   }
+
   // All logical buffers should have constraints at this point. All that
   // remains is assign the constraints to the buffers and infer layouts for
   // aliased buffers.
   TF_RETURN_IF_ERROR(AssignLayouts(constraints, computation));
 
-  // If the computation layout wasn't specified, now it is the time to compute
-  // it according to the parameters and root instruction layouts.
-  // This allows the first pass through this API to record the best flowing
-  // layout to parameters and root instruction.
-  if (computation_layout == nullptr) {
-    TF_RETURN_IF_ERROR(CalculateComputationLayout(computation));
-  }
-
   // Record the layouts assigned for any communication ops in
   // channel_constraints so that they are constrained for future modules.
   for (HloInstruction* instruction : computation->instructions()) {
@@ -1626,34 +1556,6 @@ Status LayoutAssignment::RunOnComputation(
   return Status::OK();
 }
 
-Status LayoutAssignment::PropagateComputationLayouts(
-    HloComputation* computation, ComputationLayout* computation_layout) {
-  ComputationLayout computed_computation_layout(
-      computation->ComputeProgramShape(),
-      /*ignore_layouts=*/false);
-  for (int64 i = 0; i < computed_computation_layout.parameter_count(); ++i) {
-    ShapeLayout* param_layout = computation_layout->mutable_parameter_layout(i);
-    if (!param_layout->LayoutIsSet()) {
-      VLOG(4) << "Assigning layout to parameter " << i << " of computation "
-              << computation->name() << ": "
-              << computed_computation_layout.parameter_layout(i).ToString();
-      *param_layout = computed_computation_layout.parameter_layout(i);
-    } else {
-      TF_RET_CHECK(computed_computation_layout.parameter_layout(i) ==
-                   *param_layout);
-    }
-  }
-  ShapeLayout* result_layout = computation_layout->mutable_result_layout();
-  if (!result_layout->LayoutIsSet()) {
-    VLOG(4) << "Assigning result layout of computation " << computation->name()
-            << ": " << computed_computation_layout.result_layout().ToString();
-    *result_layout = computed_computation_layout.result_layout();
-  } else {
-    TF_RET_CHECK(computed_computation_layout.result_layout() == *result_layout);
-  }
-  return Status::OK();
-}
-
 StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   VLOG(2) << "Running layout assignment on module " << module->name();
   XLA_VLOG_LINES(3, module->ToString());
@@ -1662,45 +1564,52 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
                                 "before layout assignment",
                                 module->config().debug_options());
   }
-  TF_RETURN_IF_ERROR(Init());
-
-  // We do two passes. The first one we pass a nullptr ComputationLayout to
-  // the RunOnComputation() calls (for non entry computations), and we register
-  // the ComputationLayout which are naturally flowing in DFS fashion to the
-  // parameters and root instruction.
-  // Walking in DFS mode though, means that we can end up with incorrect layouts
-  // when seen from an outer instruction, which has across-computation
-  // constraints to impose.
-  // For example, the kWhile instruction needs to enforce the same layouts for
-  // the parameters and root of the bosy, as well as the condition parameters.
-  // Similarly, the kConditional instruction needs to enforce the same layouts
-  // for the root of the true and false computations.
-  // So in the first pass, while allowing the layouts to flow to parameters and
-  // root, we also fix up the eventually inconsistent ComputationLayout, which
-  // will be then made mandatory by the second pass.
-  for (int64 i = 0; i < 2; ++i) {
-    TF_RETURN_IF_ERROR(ClearPreviousPassSideEffects(module));
-    TF_ASSIGN_OR_RETURN(auto points_to_analysis,
-                        TuplePointsToAnalysis::Run(module));
-    for (auto* computation : module->MakeComputationPostOrder()) {
-      if (computation->IsFusionComputation()) {
-        continue;
+
+  TF_ASSIGN_OR_RETURN(auto points_to_analysis,
+                      TuplePointsToAnalysis::Run(module));
+
+  // Assign layouts to computations in an order such that a callee computation
+  // is handled before its caller computation. This ensures that the layout of
+  // all callers of a computation will agree.
+  std::list<HloComputation*> computation_post_order =
+      module->MakeComputationPostOrder();
+  for (auto* computation : module->MakeComputationPostOrder()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
+    // Clear existing layouts of the instructions.  All layouts must be assigned
+    // by the LayoutAssignment pass, except for those on infeeds, parameters,
+    // and the computation result. The latter two are specified in
+    // computation_layout, so we only need to keep the existing layouts for
+    // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
+    // layout assignment pass that may accidentally use the existing layout.
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kBitcast) {
+        // bitcasts are inherently layout sensitive and so a bitcast instruction
+        // present in the IR before layout assignment is a bug.
+        return InternalError(
+            "Unexpected bitcast operation seen during layout assignment: %s.",
+            instruction->ToString().c_str());
       }
-      if (computation == module->entry_computation()) {
-        TF_RETURN_IF_ERROR(RunOnComputation(
-            entry_computation_layout_, *points_to_analysis,
-            module->entry_computation(), channel_layout_constraints_));
-      } else {
-        ComputationLayout* computation_layout =
-            (i == 0) ? nullptr : &FindOrDie(computation_layouts_, computation);
-        TF_RETURN_IF_ERROR(RunOnComputation(computation_layout,
-                                            *points_to_analysis, computation,
-                                            channel_layout_constraints_));
+      if (instruction->opcode() != HloOpcode::kInfeed) {
+        LayoutUtil::ClearLayout(instruction->mutable_shape());
       }
     }
+    if (computation == module->entry_computation()) {
+      TF_RETURN_IF_ERROR(RunOnComputation(
+          *entry_computation_layout_, *points_to_analysis,
+          module->entry_computation(), channel_layout_constraints_));
+    } else {
+      ComputationLayout computation_layout(computation->ComputeProgramShape());
+      // Setting all embedded computations to the default layout is potentially
+      // suboptimal.
+      computation_layout.SetToDefaultLayout();
+      TF_RETURN_IF_ERROR(RunOnComputation(computation_layout,
+                                          *points_to_analysis, computation,
+                                          channel_layout_constraints_));
+    }
   }
-  TF_RETURN_IF_ERROR(PropagateComputationLayouts(module->entry_computation(),
-                                                 entry_computation_layout_));
+
   TF_RETURN_IF_ERROR(CheckLayouts(module));
 
   VLOG(3) << "After layout assignment:";
@@ -1710,54 +1619,9 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
                                 "after layout assignment",
                                 module->config().debug_options());
   }
+
   // All layouts are reset then reassigned by this pass.
   return true;
 }
 
-Status LayoutAssignment::Init() {
-  computation_layouts_.clear();
-  return Status::OK();
-}
-
-Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) {
-  // Clear all the copies which have been added, and all the related
-  // instructions (like GTE and tuples).
-  int64 removed_copies = 0;
-  for (HloComputation* computation : module->computations()) {
-    for (HloInstruction* instruction :
-         computation->MakeInstructionPostOrder()) {
-      if (instruction->opcode() == HloOpcode::kCopy &&
-          added_copies_.count(instruction) > 0) {
-        VLOG(5) << "Removing added copy: " << instruction->ToString();
-        TF_RETURN_IF_ERROR(
-            instruction->ReplaceAllUsesWith(instruction->mutable_operand(0)));
-        TF_RETURN_IF_ERROR(computation->RemoveInstruction(instruction));
-        ++removed_copies;
-      }
-    }
-  }
-  added_copies_.clear();
-  if (removed_copies > 0) {
-    TupleSimplifier tuple_simplifier;
-    HloDCE dce;
-    TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
-    TF_RETURN_IF_ERROR(dce.Run(module).status());
-  }
-  return Status::OK();
-}
-
-Status LayoutAssignment::AddCopyForOperand(HloInstruction* instruction,
-                                           int64 operand_number) {
-  HloInstruction* operand = instruction->mutable_operand(operand_number);
-  if (operand->opcode() != HloOpcode::kCopy || operand->user_count() > 1) {
-    HloInstruction* copy =
-        instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
-            operand->shape(), HloOpcode::kCopy, operand));
-    SetupCopiedInstruction(*operand, copy, {});
-    LayoutUtil::ClearLayout(copy->mutable_shape());
-    TF_RETURN_IF_ERROR(instruction->ReplaceOperandWith(operand_number, copy));
-  }
-  return Status::OK();
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 8b4e07995a..ae4986d6ad 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -39,7 +39,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -363,15 +362,12 @@ class LayoutAssignment : public HloPassInterface {
       int64 operand_no);
 
  private:
-  // Initializes the layout assignment object for a new Run() call.
-  Status Init();
-
   // Adds constraints which must be satisfied for correctness on all
   // backends. Called once prior to propagating constraints.
-  Status AddMandatoryConstraints(const ComputationLayout* computation_layout,
-                                 ChannelLayoutConstraints* channel_constraints,
-                                 HloComputation* computation,
-                                 LayoutConstraints* constraints);
+  Status AddMandatoryConstraints(
+      const ComputationLayout& computation_layout,
+      const ChannelLayoutConstraints* channel_constraints,
+      HloComputation* computation, LayoutConstraints* constraints);
 
   // This method can be overridden to add backend-specific constraints to the
   // layout of the instructions of a computation. This method is called after
@@ -382,12 +378,10 @@ class LayoutAssignment : public HloPassInterface {
   }
 
   // Construct contraints and assign layouts to all instructions in the
-  // computation satisfying the given ComputationLayout, if not nullptr.
-  // Otherwise the ComputationLayout will be calculated by propagating the
-  // computation instruction contraints.
-  // Layouts constraints are added, then propagated until all LogicalBuffers in
-  // the computation are constrained.
-  Status RunOnComputation(ComputationLayout* computation_layout,
+  // computation satisfying the given ComputationLayout. Layout constraints are
+  // added, then propagated until all LogicalBuffers in the computation are
+  // constrained.
+  Status RunOnComputation(const ComputationLayout& computation_layout,
                           const TuplePointsToAnalysis& points_to_analysis,
                           HloComputation* computation,
                           ChannelLayoutConstraints* channel_constraints);
@@ -408,25 +402,6 @@ class LayoutAssignment : public HloPassInterface {
   // necessary conditions.
   Status CheckLayouts(HloModule* module);
 
-  // Computes the ComputationLayout of the given computation based of the
-  // layouts assigned to parameters and root instruction, and inserts it to the
-  // computation_layouts_ map.
-  Status CalculateComputationLayout(HloComputation* computation);
-
-  // Clears all the layouts which can be cleared within a computation.
-  Status ClearComputationLayouts(HloComputation* computation);
-
-  // Clears the side effects of a previous pass, like added copy instructions.
-  Status ClearPreviousPassSideEffects(HloModule* module);
-
-  // Propagates the layouts computed by the layout assignment pass on the given
-  // computation, to the computation layout passed in to this API.
-  // This API propagates missing layout, and also checks that the caller
-  // specified have been respected, by comparing those with the parameters and
-  // root computation instruction.
-  Status PropagateComputationLayouts(HloComputation* computation,
-                                     ComputationLayout* computation_layout);
-
   ComputationLayout* entry_computation_layout_;
 
  protected:
@@ -443,37 +418,21 @@ class LayoutAssignment : public HloPassInterface {
   // Creates and returns a copy of the given instruction with a different
   // layout. Tuple-shaped instructions will be deep-copied, and the last Tuple
   // instruction producing the copy is returned.
-  StatusOr<HloInstruction*> CreateCopyWithNewLayout(
+  static StatusOr<HloInstruction*> CreateCopyWithNewLayout(
       const Shape& shape_with_layout, HloInstruction* instruction);
 
   // Creates a copy of the given operand if the operand's layout does not match
   // the given layout. This copy replaces the use in the given instruction.
   // Tuple operands will be deep-copied.
-  Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
-                                    HloInstruction* instruction,
-                                    int64 operand_no);
-
-  // Registers a copy instruction added by the layout assignment pass.
-  void RegisterAddedCopy(HloInstruction* copy) {
-    CHECK_EQ(copy->opcode(), HloOpcode::kCopy);
-    added_copies_.insert(copy);
-  }
-
-  // Adds a copy for the operand of an instruction, unless such operand is
-  // already a copy, and has a single user (which is forcibly the instruction
-  // itself).
-  Status AddCopyForOperand(HloInstruction* instruction, int64 operand_number);
+  static Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
+                                           HloInstruction* instruction,
+                                           int64 operand_no);
 
   // Map containing the layouts of all computations assigned so
   // far. Computations are handled in a topological sort where computations are
   // handled before their caller instructions so the layouts of caller
   // instructions can be set to match the computation.
   std::map<HloComputation*, ComputationLayout> computation_layouts_;
-
-  // Every copy added to the module by the layout assignment pass is registered
-  // here.
-  tensorflow::gtl::FlatSet<HloInstruction*> added_copies_;
-
   ChannelLayoutConstraints* channel_layout_constraints_;
 };
 
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index e8403c9e95..086bd61dd0 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -308,10 +308,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
         computation_layout->mutable_result_layout()->CopyLayoutFromShape(
             shape_with_output_layout));
   } else {
-    // TODO(b/78356948): We are forcing the default layout here. We should fix
-    // clients which expect a default layout, to be explicit about it, by
-    // passing the proper ExecutionOptions with shape_with_output_layout set.
-    computation_layout->mutable_result_layout()->SetToDefaultLayout();
+    computation_layout->mutable_result_layout()->Clear();
   }
 
   config->set_replica_count(options_.number_of_replicas());
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc
index d668855084..113c2e2bd9 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier.cc
+++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc
@@ -69,7 +69,6 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
       //       Tuple
       //
       HloInstruction* top_tuple = nullptr;
-      HloInstruction* first_gte = nullptr;
       bool can_simplify = true;
       for (int64 operand_number = 0;
            operand_number < instruction->operand_count(); ++operand_number) {
@@ -79,17 +78,11 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
           can_simplify = false;
           break;
         }
-        if (first_gte == nullptr) {
-          first_gte = operand;
-        } else if (!first_gte->has_compatible_sharding(operand)) {
-          can_simplify = false;
-          break;
-        }
+
         if (top_tuple == nullptr) {
           top_tuple = operand->mutable_operand(0);
           if (!ShapeUtil::Compatible(top_tuple->shape(),
-                                     instruction->shape()) ||
-              !instruction->has_compatible_sharding(top_tuple)) {
+                                     instruction->shape())) {
             can_simplify = false;
             break;
           }
@@ -115,17 +108,15 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
       //          |
       //         GTE
       if (instruction->operand(0)->opcode() == HloOpcode::kTuple) {
+        changed = true;
         HloInstruction* element_source =
             instruction->mutable_operand(0)->mutable_operand(
                 instruction->tuple_index());
-        if (instruction->has_compatible_sharding(element_source)) {
-          changed = true;
-          TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source));
-          for (HloInstruction* user : element_source->users()) {
-            if (user->opcode() == HloOpcode::kTuple ||
-                user->opcode() == HloOpcode::kGetTupleElement) {
-              worklist.push(user);
-            }
+        TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source));
+        for (HloInstruction* user : element_source->users()) {
+          if (user->opcode() == HloOpcode::kTuple ||
+              user->opcode() == HloOpcode::kGetTupleElement) {
+            worklist.push(user);
           }
         }
       }
-- 
GitLab


From 152f7163470ad53f372257c49c7ae88d774fcbaf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 14:03:12 -0700
Subject: [PATCH 430/450] Automated g4 rollback of changelist 193788768

PiperOrigin-RevId: 194293938
---
 tensorflow/contrib/image/kernels/image_ops.cc |  7 +--
 tensorflow/contrib/image/kernels/image_ops.h  |  2 +-
 tensorflow/contrib/image/ops/image_ops.cc     | 54 ++-----------------
 .../python/kernel_tests/image_ops_test.py     | 30 -----------
 .../contrib/image/python/ops/image_ops.py     | 39 ++++++--------
 5 files changed, 24 insertions(+), 108 deletions(-)

diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index ae4b1ba62a..c2e32da133 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -70,7 +70,6 @@ class ImageProjectiveTransform : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& images_t = ctx->input(0);
     const Tensor& transform_t = ctx->input(1);
-    const Tensor& output_dim = ctx->input(2);
     OP_REQUIRES(ctx, images_t.shape().dims() == 4,
                 errors::InvalidArgument("Input images must have rank 4"));
     OP_REQUIRES(ctx,
@@ -84,11 +83,7 @@ class ImageProjectiveTransform : public OpKernel {
     auto images = images_t.tensor<T, 4>();
     auto transform = transform_t.matrix<float>();
     Tensor* output_t;
-    // Image is NHWC format.
-    auto output_shape = images_t.shape();
-    output_shape.set_dim(1, output_dim.vec<int>()(0));
-    output_shape.set_dim(2, output_dim.vec<int>()(1));
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t));
     auto output = output_t->tensor<T, 4>();
     (FillProjectiveTransform<Device, T>(interpolation_))(
         ctx->eigen_device<Device>(), &output, images, transform);
diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h
index 2320329b92..ad50133061 100644
--- a/tensorflow/contrib/image/kernels/image_ops.h
+++ b/tensorflow/contrib/image/kernels/image_ops.h
@@ -161,7 +161,7 @@ struct FillProjectiveTransform {
   void operator()(const Device& device, OutputType* output,
                   const InputType& images,
                   const TransformsType& transform) const {
-    output->device(device) = output->generate(
+    output->device(device) = images.generate(
         ProjectiveGenerator<Device, T>(images, transform, interpolation_));
   }
 };
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 295908d44b..ebdcaea7ab 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -19,55 +19,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
-namespace {
-
-// Sets output[0] to shape [batch_dim,height,width,channel_dim], where
-// height and width come from the size_tensor.
-Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim,
-                             int size_input_idx, DimensionHandle channel_dim) {
-  // Verify shape of size input.
-  ShapeHandle size;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size));
-  DimensionHandle unused;
-  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused));
-
-  // Get size values from the size tensor.
-  const Tensor* size_tensor = c->input_tensor(size_input_idx);
-  DimensionHandle width;
-  DimensionHandle height;
-  if (size_tensor == nullptr) {
-    width = c->UnknownDim();
-    height = c->UnknownDim();
-  } else {
-    // TODO(petewarden) - Remove once we have constant evaluation in C++ only.
-    if (size_tensor->dtype() != DT_INT32) {
-      return errors::InvalidArgument(
-          "Bad size input type for SetOutputToSizedImage: Expected DT_INT32 "
-          "but got ",
-          DataTypeString(size_tensor->dtype()), " for input #", size_input_idx,
-          " in ", c->DebugString());
-    }
-    auto vec = size_tensor->vec<int32>();
-    height = c->MakeDim(vec(0));
-    width = c->MakeDim(vec(1));
-  }
-  c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim}));
-  return Status::OK();
-}
-
-Status ResizeShapeFn(InferenceContext* c) {
-  ShapeHandle input;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
-  return SetOutputToSizedImage(c, c->Dim(input, 0), 2 /* size_input_idx */,
-                               c->Dim(input, 3));
-}
-
-}  // namespace
-
 // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc.
 // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0).
 // TODO(ringwalt): Add an "output_shape" argument. This is sufficient to
@@ -75,11 +29,13 @@ Status ResizeShapeFn(InferenceContext* c) {
 REGISTER_OP("ImageProjectiveTransform")
     .Input("images: dtype")
     .Input("transforms: float32")
-    .Input("output_shape: int32")
     .Attr("dtype: {uint8, int32, int64, float32, float64}")
     .Attr("interpolation: string")
     .Output("transformed_images: dtype")
-    .SetShapeFn(ResizeShapeFn)
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Applies the given transform to each of the images.
 
@@ -93,7 +49,7 @@ If one row of `transforms` is `[a0, a1, a2, b0, b1, b2, c0, c1]`, then it maps
 the *output* point `(x, y)` to a transformed *input* point
 `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, where
 `k = c0 x + c1 y + 1`. If the transformed point lays outside of the input
-image, the output pixel is set to 0.
+image, the output pixel is set to 0. The output is the same size as the input,
 
 images: 4D `Tensor`, input image(s) in NHWC format.
 transforms: 2D `Tensor`, projective transform(s) to apply to the image(s).
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index c0151d320f..b50177ae56 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -195,40 +195,10 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
           x_init_value=test_image)
       self.assertLess(left_err, 1e-10)
 
-  def _test_grad_different_shape(self, input_shape, output_shape):
-    with self.test_session():
-      test_image_shape = input_shape
-      test_image = np.random.randn(*test_image_shape)
-      test_image_tensor = constant_op.constant(
-          test_image, shape=test_image_shape)
-      test_transform = image_ops.angles_to_projective_transforms(
-          np.pi / 2, 4, 4)
-
-      if len(output_shape) == 2:
-        resize_shape = output_shape
-      elif len(output_shape) == 3:
-        resize_shape = output_shape[0:2]
-      elif len(output_shape) == 4:
-        resize_shape = output_shape[1:3]
-      output = image_ops.transform(
-          images=test_image_tensor,
-          transforms=test_transform,
-          output_shape=resize_shape)
-      left_err = gradient_checker.compute_gradient_error(
-          test_image_tensor,
-          test_image_shape,
-          output,
-          output_shape,
-          x_init_value=test_image)
-      self.assertLess(left_err, 1e-10)
-
   def test_grad(self):
     self._test_grad([16, 16])
     self._test_grad([4, 12, 12])
     self._test_grad([3, 4, 12, 12])
-    self._test_grad_different_shape([16, 16], [8, 8])
-    self._test_grad_different_shape([4, 12, 3], [8, 24, 3])
-    self._test_grad_different_shape([3, 4, 12, 3], [3, 8, 24, 3])
 
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index d3c114a88d..cd984c8054 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -212,11 +212,7 @@ def translations_to_projective_transforms(translations, name=None):
         axis=1)
 
 
-def transform(images,
-              transforms,
-              interpolation="NEAREST",
-              output_shape=None,
-              name=None):
+def transform(images, transforms, interpolation="NEAREST", name=None):
   """Applies the given transform(s) to the image(s).
 
   Args:
@@ -233,10 +229,6 @@ def transform(images,
        the transform mapping input points to output points. Note that gradients
        are not backpropagated into transformation parameters.
     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
-    output_shape: Output dimesion after the transform, [height, width].
-       If None, output is the same size as input image.
-
-    name: The name of the op.
 
   Returns:
     Image(s) with the same type and shape as `images`, with the given
@@ -263,13 +255,6 @@ def transform(images,
     else:
       raise TypeError("Images should have rank between 2 and 4.")
 
-    if output_shape is None:
-      output_shape = array_ops.shape(images)[1:3]
-    elif len(output_shape) != 2:
-      raise TypeError(
-          "output_shape must either be None or a vector of 2 elements. %s" %
-          str(output_shape))
-
     if len(transform_or_transforms.get_shape()) == 1:
       transforms = transform_or_transforms[None]
     elif transform_or_transforms.get_shape().ndims is None:
@@ -280,7 +265,7 @@ def transform(images,
     else:
       raise TypeError("Transforms should have rank 1 or 2.")
     output = gen_image_ops.image_projective_transform(
-        images, transforms, output_shape, interpolation=interpolation.upper())
+        images, transforms, interpolation=interpolation.upper())
     if len(image_or_images.get_shape()) == 2:
       return output[0, :, :, 0]
     elif len(image_or_images.get_shape()) == 3:
@@ -390,6 +375,14 @@ def _image_projective_transform_grad(op, grad):
 
   if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES:
     raise TypeError("Invalid dtype %s." % image_or_images.dtype)
+  if len(image_or_images.get_shape()) == 2:
+    images = image_or_images[None, :, :, None]
+  elif len(image_or_images.get_shape()) == 3:
+    images = image_or_images[None, :, :, :]
+  elif len(image_or_images.get_shape()) == 4:
+    images = image_or_images
+  else:
+    raise TypeError("Images should have rank between 2 and 4")
   if len(transform_or_transforms.get_shape()) == 1:
     transforms = transform_or_transforms[None]
   elif len(transform_or_transforms.get_shape()) == 2:
@@ -402,11 +395,13 @@ def _image_projective_transform_grad(op, grad):
   inverse = linalg_ops.matrix_inverse(transforms)
   transforms = matrices_to_flat_transforms(inverse)
   output = gen_image_ops.image_projective_transform(
-      images=grad,
-      transforms=transforms,
-      output_shape=array_ops.shape(image_or_images)[1:3],
-      interpolation=interpolation)
-  return [output, None, None]
+      grad, transforms, interpolation=interpolation)
+  if len(image_or_images.get_shape()) == 2:
+    return [output[0, :, :, 0], None]
+  elif len(image_or_images.get_shape()) == 3:
+    return [output[0, :, :, :], None]
+  else:
+    return [output, None]
 
 
 def bipartite_match(distance_mat,
-- 
GitLab


From e52706d1696faa2ab926c2d91a0d85ec99dac314 Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Wed, 25 Apr 2018 14:25:30 -0700
Subject: [PATCH 431/450] Fixing contrib.boosted_trees in Windows (module not
 linked) Trying to fix #14292

PiperOrigin-RevId: 194297723
---
 tensorflow/contrib/cmake/python_modules.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 2554b3a6e0..6468bed497 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -129,7 +129,11 @@ tensorflow/contrib/boosted_trees/kernels
 tensorflow/contrib/boosted_trees/ops
 tensorflow/contrib/boosted_trees/proto
 tensorflow/contrib/boosted_trees/python
+tensorflow/contrib/boosted_trees/python/kernel_tests
 tensorflow/contrib/boosted_trees/python/ops
+tensorflow/contrib/boosted_trees/python/training
+tensorflow/contrib/boosted_trees/python/training/functions
+tensorflow/contrib/boosted_trees/python/utils
 tensorflow/contrib/checkpoint
 tensorflow/contrib/checkpoint/python
 tensorflow/contrib/cloud
-- 
GitLab


From 4112f2409f79c0e4581bb070c4a7c660ce4d3a3e Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 25 Apr 2018 14:36:17 -0700
Subject: [PATCH 432/450] Remove the parallel cpu backend

PiperOrigin-RevId: 194299356
---
 tensorflow/compiler/xla/service/cpu/BUILD     |  50 --
 .../compiler/xla/service/cpu/cpu_compiler.cc  | 258 +++------
 .../compiler/xla/service/cpu/cpu_options.cc   |   7 -
 .../compiler/xla/service/cpu/cpu_options.h    |   1 -
 .../cpu/cpu_parallelization_preparation.cc    | 192 -------
 .../cpu/cpu_parallelization_preparation.h     |  80 ---
 .../compiler/xla/service/cpu/ir_emitter.cc    |  21 +-
 .../compiler/xla/service/cpu/ir_emitter.h     |   2 -
 .../service/cpu/parallel_cpu_executable.cc    | 528 ------------------
 .../xla/service/cpu/parallel_cpu_executable.h | 137 -----
 10 files changed, 69 insertions(+), 1207 deletions(-)
 delete mode 100644 tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc
 delete mode 100644 tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h
 delete mode 100644 tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
 delete mode 100644 tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h

diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 246b802861..04fda3b2df 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -89,12 +89,10 @@ cc_library(
         ":cpu_instruction_fusion",
         ":cpu_layout_assignment",
         ":cpu_options",
-        ":cpu_parallelization_preparation",
         ":disassembler",
         ":dot_op_emitter",
         ":ir_emission_utils",
         ":ir_emitter",
-        ":parallel_cpu_executable",
         ":parallel_task_assignment",
         ":simple_orc_jit",
         "//tensorflow/compiler/xla:literal_util",
@@ -232,35 +230,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "parallel_cpu_executable",
-    srcs = ["parallel_cpu_executable.cc"],
-    hdrs = [
-        "parallel_cpu_executable.h",
-    ],
-    deps = [
-        ":cpu_runtime",
-        ":shape_partition",
-        ":simple_orc_jit",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:buffer_assignment",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
-        "//tensorflow/compiler/xla/service:executable",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_execution_profile",
-        "//tensorflow/compiler/xla/service:logical_buffer",
-        "//tensorflow/compiler/xla/service:shaped_buffer",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor_no_cuda",
-        "@llvm//:orc_jit",
-    ],
-)
-
 cc_library(
     name = "ir_emitter",
     srcs = [
@@ -661,25 +630,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "cpu_parallelization_preparation",
-    srcs = ["cpu_parallelization_preparation.cc"],
-    hdrs = [
-        "cpu_parallelization_preparation.h",
-    ],
-    deps = [
-        ":ir_emission_utils",
-        ":parallel_task_assignment",
-        ":shape_partition",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_cost_analysis",
-        "//tensorflow/compiler/xla/service:hlo_pass",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_library(
     name = "ir_emission_utils",
     srcs = ["ir_emission_utils.cc"],
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index e8472fd36b..3c0c367df3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -56,12 +56,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h"
 #include "tensorflow/compiler/xla/service/cpu/disassembler.h"
 #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emitter.h"
-#include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h"
 #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
@@ -308,10 +306,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
       module->config().intra_op_parallelism_threads() > 0
           ? module->config().intra_op_parallelism_threads()
           : tensorflow::port::NumSchedulableCPUs();
-  if (options::CpuParallelBackendRequested(module->config())) {
-    pipeline.AddPass<ParallelizationPreparation>(max_parallelism,
-                                                 ShapeSizeBytesFunction());
-  } else if (!is_aot_compile) {
+  if (!is_aot_compile) {
     // Run ParallelTaskAssigner to assign parallel tasks to HLOs in module.
     // Note this is not run for AOT because it would bring in thread pool
     // and thread synchronization dependencies which would likely increase
@@ -329,13 +324,6 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   pipeline.AddPass<HloDCE>();
   pipeline.AddPass<FlattenCallGraph>();
   pipeline.AddPass<CpuCopyInsertion>();
-  if (options::CpuParallelBackendRequested(module->config())) {
-    // Re-run the outlining, in case any copies were inserted into the entry
-    // computation.
-    pipeline.AddPass<ParallelizationPreparation>(max_parallelism,
-                                                 ShapeSizeBytesFunction());
-    pipeline.AddPass<CpuCopyInsertion>();
-  }
   pipeline.AddPass<HloDCE>();
   return pipeline.Run(module).status();
 }
@@ -522,190 +510,80 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   const string xla_dump_optimized_hlo_proto_to =
       module->config().debug_options().xla_dump_optimized_hlo_proto_to();
 
-  if (options::CpuParallelBackendRequested(module->config())) {
-    VLOG(1) << "Using parallel cpu backend";
-
-    // Run buffer analysis on the HLO graph. This analysis figures out which
-    // temporary buffers are required to run the computation.
-    // DependencyHloOrdering is used for the parallel emitter because the order
-    // of HLO instruction execution is not known ahead of time.
-    // DependencyHloOrdering is the most conservative partial order and only
-    // uses data dependencies for determining order.
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(
-            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()),
-            BufferSizeBytesFunction(), memory_alignment));
-    // BufferAssignment::ToString() includes a header, so no need for us to
-    // print one ourselves.
-    XLA_VLOG_LINES(2, assignment->ToString());
-
-    if (!xla_dump_optimized_hlo_proto_to.empty()) {
-      HloProto proto = MakeHloProto(*module, *assignment);
-      TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_optimized_hlo_proto_to, module->name()));
-    }
-
-    // If we are using the parallel CPU backend, we need to create map from
-    // HloInstruction to the corresponding generated function name.
-    std::map<HloComputation*, HloInstruction*> parallel_computations;
-    std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
-        aligned_constants;
-    for (auto instruction : entry_computation->MakeInstructionPostOrder()) {
-      // Parameters and constants don't get their own computation.
-      if (instruction->opcode() == HloOpcode::kParameter) {
-        continue;
-      }
-      if (instruction->opcode() == HloOpcode::kConstant) {
-        // Copy the constant out of the ProtocolBuffer so that we can give it a
-        // higher alignment.
-        const void* data = instruction->literal().untyped_data();
-        int64 size = CpuExecutable::ShapeSizeBytes(instruction->shape());
-        auto iter = aligned_constants.emplace(
-            instruction, xla::MakeUnique<unsigned char[]>(size));
-        CHECK_EQ(iter.second, true);
-        unsigned char* aligned_data = iter.first->second.get();
-        memcpy(aligned_data, data, size);
-        continue;
-      }
-      // The parallel preparation should have ensured that the top-level
-      // computation consists solely of Call instructions.
-      TF_RET_CHECK(instruction->opcode() == HloOpcode::kCall)
-          << module->ToString();
-      HloComputation* to_apply = instruction->to_apply();
-      parallel_computations.emplace(to_apply, instruction);
-    }
-
-    IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
-                         std::move(instruction_to_profile_idx),
-                         std::move(computation_to_profile_idx),
-                         jit->target_machine(), jit->external_constant_pool());
-
-    std::unique_ptr<HloInstructionMap<string>> function_names(
-        new HloInstructionMap<string>());
-    for (auto embedded_computation :
-         entry_computation->MakeEmbeddedComputationsList()) {
-      if (embedded_computation->IsFusionComputation()) {
-        continue;
-      }
-      auto parallel_computation_iter =
-          parallel_computations.find(embedded_computation);
-      // All parallel computations are considered to be an entry computation for
-      // IR generation purposes.
-      bool computation_is_parallel =
-          parallel_computation_iter != parallel_computations.end();
-      TF_ASSIGN_OR_RETURN(
-          llvm::Function * ir_function,
-          ir_emitter.EmitComputation(
-              embedded_computation, embedded_computation->name(),
-              /*is_top_level_computation=*/computation_is_parallel,
-              /*instruction_order=*/nullptr));
-      // If this computation is parallel, remember it in the function name map.
-      // This way we know what function to execute when we try to run code for
-      // the Call instruction.
-      if (computation_is_parallel) {
-        HloInstruction* call_instruction = parallel_computation_iter->second;
-        InsertOrDie(function_names.get(), call_instruction,
-                    llvm_ir::AsString(ir_function->getName()));
-      }
-    }
-
-    string ir_module_string;
-    if (embed_ir_in_executable) {
-      ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
-    }
-    TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
-
-    // JIT compile the LLVM IR module to in-memory machine code.
-    jit->AddModule(std::move(llvm_module));
-    cpu_executable.reset(new ParallelCpuExecutable(
-        std::move(jit), std::move(assignment), std::move(module),
-        std::move(function_names), std::move(aligned_constants),
-        std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
-
-    if (embed_ir_in_executable) {
-      static_cast<CpuExecutable&>(*cpu_executable)
-          .set_ir_module_string(ir_module_string);
-    }
-  } else {
-    VLOG(1) << "Using sequential cpu backend";
-
-    // Select an order for emitting the HLO instructions for each
-    // computation. Using this sequence enables tighter buffer liveness analysis
-    // and reduced memory usage (as compared to using DependencyHloOrdering).
-    TF_ASSIGN_OR_RETURN(
-        SequentialHloOrdering::HloModuleSequence module_sequence,
-        CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction()));
-
-    // Run buffer analysis on the HLO graph. This analysis figures out which
-    // temporary buffers are required to run the computation.
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(module.get(),
-                            xla::MakeUnique<SequentialHloOrdering>(
-                                module.get(), module_sequence),
-                            BufferSizeBytesFunction(), memory_alignment));
-    // BufferAssignment::ToString() includes a header, so no need for us to
-    // print one ourselves.
-    XLA_VLOG_LINES(2, assignment->ToString());
-
-    if (!xla_dump_optimized_hlo_proto_to.empty()) {
-      HloProto proto = MakeHloProto(*module, *assignment);
-      TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_optimized_hlo_proto_to, module->name()));
-    }
-
-    // Each computation is a single function.  Emit all embedded computations
-    // before the entry computation. The order of computations returned from
-    // GetEmbeddedComputations guarantees that a called computation occurs
-    // before a caller computation.
+  // Select an order for emitting the HLO instructions for each
+  // computation. Using this sequence enables tighter buffer liveness analysis
+  // and reduced memory usage (as compared to using DependencyHloOrdering).
+  TF_ASSIGN_OR_RETURN(
+      SequentialHloOrdering::HloModuleSequence module_sequence,
+      CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction()));
+
+  // Run buffer analysis on the HLO graph. This analysis figures out which
+  // temporary buffers are required to run the computation.
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<BufferAssignment> assignment,
+      BufferAssigner::Run(
+          module.get(),
+          xla::MakeUnique<SequentialHloOrdering>(module.get(), module_sequence),
+          BufferSizeBytesFunction(), memory_alignment));
+  // BufferAssignment::ToString() includes a header, so no need for us to
+  // print one ourselves.
+  XLA_VLOG_LINES(2, assignment->ToString());
+
+  if (!xla_dump_optimized_hlo_proto_to.empty()) {
+    HloProto proto = MakeHloProto(*module, *assignment);
+    TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
+        proto, xla_dump_optimized_hlo_proto_to, module->name()));
+  }
 
-    IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
-                         std::move(instruction_to_profile_idx),
-                         std::move(computation_to_profile_idx),
-                         jit->target_machine(), jit->external_constant_pool());
+  // Each computation is a single function.  Emit all embedded computations
+  // before the entry computation. The order of computations returned from
+  // GetEmbeddedComputations guarantees that a called computation occurs
+  // before a caller computation.
 
-    for (auto embedded_computation :
-         entry_computation->MakeEmbeddedComputationsList()) {
-      if (embedded_computation->IsFusionComputation()) {
-        continue;
-      }
-      TF_RETURN_IF_ERROR(
-          ir_emitter
-              .EmitComputation(embedded_computation,
-                               embedded_computation->name(),
-                               /*is_top_level_computation=*/false,
-                               &module_sequence.at(embedded_computation))
-              .status());
-    }
-    string function_name_prefix = entry_computation->name().empty()
-                                      ? "__compute"
-                                      : entry_computation->name();
-    TF_ASSIGN_OR_RETURN(
-        llvm::Function * entry_function,
-        ir_emitter.EmitComputation(entry_computation, function_name_prefix,
-                                   /*is_top_level_computation=*/true,
-                                   &module_sequence.at(entry_computation)));
+  IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
+                       std::move(instruction_to_profile_idx),
+                       std::move(computation_to_profile_idx),
+                       jit->target_machine(), jit->external_constant_pool());
 
-    string function_name = llvm_ir::AsString(entry_function->getName());
-    string ir_module_string;
-    if (embed_ir_in_executable) {
-      ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
+  for (auto embedded_computation :
+       entry_computation->MakeEmbeddedComputationsList()) {
+    if (embedded_computation->IsFusionComputation()) {
+      continue;
     }
-    TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
+    TF_RETURN_IF_ERROR(
+        ir_emitter
+            .EmitComputation(embedded_computation, embedded_computation->name(),
+                             /*is_top_level_computation=*/false,
+                             &module_sequence.at(embedded_computation))
+            .status());
+  }
+  string function_name_prefix = entry_computation->name().empty()
+                                    ? "__compute"
+                                    : entry_computation->name();
+  TF_ASSIGN_OR_RETURN(
+      llvm::Function * entry_function,
+      ir_emitter.EmitComputation(entry_computation, function_name_prefix,
+                                 /*is_top_level_computation=*/true,
+                                 &module_sequence.at(entry_computation)));
+
+  string function_name = llvm_ir::AsString(entry_function->getName());
+  string ir_module_string;
+  if (embed_ir_in_executable) {
+    ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
+  }
+  TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
 
-    XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module));
+  XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module));
 
-    // JIT compile the LLVM IR module to in-memory machine code.
-    jit->AddModule(std::move(llvm_module));
-    cpu_executable.reset(new CpuExecutable(
-        std::move(jit), std::move(assignment), std::move(module), function_name,
-        std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
+  // JIT compile the LLVM IR module to in-memory machine code.
+  jit->AddModule(std::move(llvm_module));
+  cpu_executable.reset(new CpuExecutable(
+      std::move(jit), std::move(assignment), std::move(module), function_name,
+      std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
 
-    if (embed_ir_in_executable) {
-      static_cast<CpuExecutable&>(*cpu_executable)
-          .set_ir_module_string(ir_module_string);
-    }
+  if (embed_ir_in_executable) {
+    static_cast<CpuExecutable&>(*cpu_executable)
+        .set_ir_module_string(ir_module_string);
   }
 
   VLOG(1) << "Compilation finished";
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index 09f028463a..f9c51f243c 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 namespace {
 
-const char* const kXlaParallelCpuOption = "xla_cpu_parallel";
 const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
 const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce";
 const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
@@ -30,12 +29,6 @@ namespace xla {
 namespace cpu {
 namespace options {
 
-bool CpuParallelBackendRequested(const HloModuleConfig& config) {
-  const auto& extra_options_map =
-      config.debug_options().xla_backend_extra_options();
-  return extra_options_map.count(kXlaParallelCpuOption) > 0;
-}
-
 bool OptimizeForSizeRequested(const HloModuleConfig& config) {
   const auto& extra_options_map =
       config.debug_options().xla_backend_extra_options();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h
index 6ba0fd2453..be62ff3cc1 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h
@@ -24,7 +24,6 @@ namespace xla {
 namespace cpu {
 namespace options {
 
-bool CpuParallelBackendRequested(const HloModuleConfig& config);
 bool OptimizeForSizeRequested(const HloModuleConfig& config);
 bool VectorizedReduceDisabled(const HloModuleConfig& config);
 tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc
deleted file mode 100644
index 662ee60923..0000000000
--- a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h"
-
-#include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
-#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-
-namespace xla {
-namespace cpu {
-
-StatusOr<bool> ParallelizationPreparation::Run(HloModule* module) {
-  XLA_VLOG_LINES(2, "ParallelizationPreparation ENTRY");
-  XLA_VLOG_LINES(2, module->ToString());
-
-  bool changed = false;
-  TF_ASSIGN_OR_RETURN(changed, RunParallelTaskAssignment(module));
-
-  HloComputation* entry_computation = module->entry_computation();
-  std::unordered_set<HloInstruction*> outlined;
-  std::vector<HloInstruction*> instructions_to_outline;
-  for (HloInstruction* instruction :
-       entry_computation->MakeInstructionPostOrder()) {
-    // If the instruction has been outlined, it no longer exists and we must not
-    // dereference it.
-    if (outlined.count(instruction) > 0) {
-      continue;
-    }
-
-    // Skip parameters and constants, there is nothing to parallelize.
-    if (instruction->opcode() == HloOpcode::kParameter ||
-        instruction->opcode() == HloOpcode::kConstant) {
-      continue;
-    }
-
-    // Outline 'instruction' in isolation if it was assigned parallel tasks.
-    if (OutlineParallelizableInstruction(instruction)) {
-      outlined.insert(instruction);
-      changed = true;
-      continue;
-    }
-
-    instructions_to_outline.clear();
-    HloInstruction* outline_candidate = instruction;
-    instructions_to_outline.push_back(outline_candidate);
-
-    // Outline sole users with the current instruction.
-    while (CanOutlineWithUser(outline_candidate)) {
-      HloInstruction* prior_candidate = outline_candidate;
-      outline_candidate = *outline_candidate->users().begin();
-      if (std::any_of(outline_candidate->operands().begin(),
-                      outline_candidate->operands().end(),
-                      [&](const HloInstruction* operand) {
-                        // Do not consider any candidates which have operands
-                        // other than the prior candidate, constants or
-                        // parameters. Otherwise, we'd increase the fan-in which
-                        // would reduce parallelism.
-                        return operand->opcode() != HloOpcode::kParameter &&
-                               operand->opcode() != HloOpcode::kConstant &&
-                               operand != prior_candidate;
-                      })) {
-        break;
-      }
-      instructions_to_outline.push_back(outline_candidate);
-    }
-
-    outlined.insert(instructions_to_outline.begin(),
-                    instructions_to_outline.end());
-
-    // Optimization to avoid replacing a single existing kCall with another
-    // kCall that just calls the first one.
-    if (instructions_to_outline.size() == 1 &&
-        instructions_to_outline[0]->opcode() == HloOpcode::kCall) {
-      continue;
-    }
-
-    module->OutlineExpressionFromComputation(
-        instructions_to_outline,
-        tensorflow::strings::StrCat("pp_", instruction->name()),
-        entry_computation);
-    changed = true;
-  }
-
-  XLA_VLOG_LINES(2, "ParallelizationPreparation EXIT");
-  XLA_VLOG_LINES(2, module->ToString());
-  return changed;
-}
-
-StatusOr<bool> ParallelizationPreparation::RunParallelTaskAssignment(
-    HloModule* module) {
-  VLOG(1) << "RunParallelTaskAssignment max_parallelism_: " << max_parallelism_;
-  bool changed = false;
-  // Initialize ParallelTaskAssignment.
-  ParallelTaskAssignment parallel_task_assignment(max_parallelism_, shape_size_,
-                                                  module);
-  // Assign parallel tasks to HLOs in entry computation.
-  HloComputation* computation = module->entry_computation();
-  for (auto* instruction : computation->instructions()) {
-    // Calculate target parallel task count in [1, max_parallelism_].
-    const int64 target_parallel_task_count =
-        parallel_task_assignment.GetTargetParallelTaskCount(instruction);
-    if (target_parallel_task_count == 1) {
-      continue;
-    }
-
-    // Assign feasible dimension partitions (based on actual dimension sizes).
-    auto dim_partition_counts = ShapePartitionAssigner(instruction->shape())
-                                    .Run(target_parallel_task_count);
-    const int64 total_partition_count =
-        ShapePartitionAssigner::GetTotalPartitionCount(dim_partition_counts);
-    if (total_partition_count <= 1) {
-      // Feasible partition calculation resulting in no partitioning, so skip.
-      continue;
-    }
-    VLOG(2) << "Assigning parallel task count: " << total_partition_count
-            << " to instruction: " << instruction->name();
-    // Map 'instruction' to assigned dimension partitioning.
-    instruction->set_outer_dimension_partitions(dim_partition_counts);
-  }
-
-  return changed;
-}
-
-bool ParallelizationPreparation::OutlineParallelizableInstruction(
-    HloInstruction* instruction) {
-  if (instruction->outer_dimension_partitions().empty()) {
-    return false;
-  }
-  // Store dimension partition counts before outlining (which clones
-  // 'instruction').
-  std::vector<int64> dim_partition_counts =
-      instruction->outer_dimension_partitions();
-  // Outline 'instruction' in its own sub-computation.
-  HloModule* module = instruction->parent()->parent();
-  auto* call = module->OutlineExpressionFromComputation(
-      {instruction}, tensorflow::strings::StrCat("pp_", instruction->name()),
-      module->entry_computation());
-  // Map previously assigned 'dim_partition_counts' to cloned root instruction.
-  VLOG(1) << "Outlining parallelizable"
-          << " caller: " << call->name()
-          << " callee: " << call->to_apply()->root_instruction()->name();
-  call->to_apply()->root_instruction()->set_outer_dimension_partitions(
-      dim_partition_counts);
-  return true;
-}
-
-bool ParallelizationPreparation::CanOutlineWithUser(
-    HloInstruction* instruction) {
-  if (instruction->users().size() != 1) {
-    // Do not outline 'instruction' with multiple users.
-    return false;
-  }
-  if (AssignedParallelTasks(instruction) ||
-      AssignedParallelTasks(*instruction->users().begin())) {
-    // Do not outline if 'instruction' (or user) were assigned parallel tasks.
-    return false;
-  }
-  return true;
-}
-
-bool ParallelizationPreparation::AssignedParallelTasks(
-    HloInstruction* instruction) {
-  return !instruction->outer_dimension_partitions().empty() ||
-         (instruction->opcode() == HloOpcode::kCall &&
-          !instruction->to_apply()
-               ->root_instruction()
-               ->outer_dimension_partitions()
-               .empty());
-}
-
-}  // namespace cpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h
deleted file mode 100644
index 87be758ef5..0000000000
--- a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_
-
-#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
-
-namespace xla {
-namespace cpu {
-
-// This pass prepares an HLO module for parallel execution by transforming
-// subgraphs of the top-level computation into embedded computations which can
-// be executed in parallel.
-// TODO(b/29630486): Currently, it is limited to turning all instructions (which
-// are not constants or parameters) in the entry computation into embedded
-// computations.  However, it could make sense to coarsen the parallelization to
-// improve cache locality.  Also, we will need to do something to intelligently
-// handle While constructs.
-class ParallelizationPreparation : public HloPassInterface {
- public:
-  // 'max_parallelism': the maximum parallel task count per instruction.
-  // 'shape_size': shape size function used by HloCostAnalysis during parallel
-  //               task assignment.
-  ParallelizationPreparation(
-      const int64 max_parallelism,
-      const HloCostAnalysis::ShapeSizeFunction& shape_size)
-      : max_parallelism_(max_parallelism), shape_size_(shape_size) {}
-  ~ParallelizationPreparation() override {}
-
-  tensorflow::StringPiece name() const override {
-    return "cpu-parallel-prepare";
-  }
-
-  // Run parallel preparation on the given computation. Returns whether the
-  // computation was changed.
-  StatusOr<bool> Run(HloModule* module) override;
-
- private:
-  // Assigns parallel task partitions to conformant instructions in 'module'.
-  // Returns true on success or error status otherwise.
-  StatusOr<bool> RunParallelTaskAssignment(HloModule* module);
-
-  // Outlines 'instruction' from entry computation, if it had
-  // been assigned parallel tasks in an earlier pass through the computation.
-  // Returns true if 'instruction' was successfully outlined, false otherwise.
-  bool OutlineParallelizableInstruction(HloInstruction* instruction);
-
-  // Returns true if 'instruction' can be outlined into the same sub-computation
-  // with its single user (parallelizable instructions are not outlined with
-  // each other). Returns false otherwise.
-  bool CanOutlineWithUser(HloInstruction* instruction);
-
-  // Returns true if 'instruction' (or the root of the sub-computation that
-  // 'instruction' calls) has had parallel tasks assigned in earlier pass.
-  // Returns false otherwise.
-  bool AssignedParallelTasks(HloInstruction* instruction);
-
-  const int64 max_parallelism_;
-  const HloCostAnalysis::ShapeSizeFunction shape_size_;
-};
-
-}  // namespace cpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index f990ee2785..0b08ad8da3 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -93,8 +93,6 @@ IrEmitter::IrEmitter(
       computation_to_profile_idx_(std::move(computation_to_profile_idx)),
       alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
       hlo_module_config_(hlo_module.config()),
-      parallel_cpu_backend_(
-          options::CpuParallelBackendRequested(hlo_module_config_)),
       is_top_level_computation_(false),
       target_machine_features_(target_machine),
       external_constant_pool_(external_constant_pool) {
@@ -2163,8 +2161,7 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
 
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(call));
 
-  if (!computation->root_instruction()->outer_dimension_partitions().empty() &&
-      !parallel_cpu_backend_) {
+  if (!computation->root_instruction()->outer_dimension_partitions().empty()) {
     // ParallelTaskAssignment assigned partitions, emit call to
     // ParallelForkJoin.
     std::vector<llvm::Value*> call_args = GetArrayFunctionCallArguments(
@@ -2550,22 +2547,6 @@ Status IrEmitter::FinishVisit(HloInstruction* root) {
     }
   };
 
-  // For the parallel cpu backend, we record the total for each embedded
-  // computation callee with its caller kCall HLO.
-  if (parallel_cpu_backend_ && is_top_level_computation_) {
-    auto* computation = root->parent();
-    auto* entry_computation = computation->parent()->entry_computation();
-    if (computation != entry_computation) {
-      for (HloInstruction* instruction : entry_computation->instructions()) {
-        if (instruction->opcode() == HloOpcode::kCall &&
-            instruction->to_apply()->root_instruction() == root) {
-          record_complete_computation(GetProfileCounterFor(*instruction));
-          return Status::OK();
-        }
-      }
-    }
-  }
-
   // For the entry computation this increment is cumulative of embedded
   // computations since it includes cycles spent in computations invoked by
   // While, Call etc.
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 5094402514..0f2f3d1817 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -532,8 +532,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
 
   const HloModuleConfig& hlo_module_config_;
 
-  const bool parallel_cpu_backend_;
-
   bool is_top_level_computation_;
 
   TargetMachineFeatures target_machine_features_;
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
deleted file mode 100644
index 035f9ddb2e..0000000000
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ /dev/null
@@ -1,528 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h"
-
-#include <stdint.h>
-#include <algorithm>
-#include <deque>
-#include <iterator>
-#include <list>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
-#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/service/shaped_buffer.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/mem.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-namespace cpu {
-
-ParallelCpuExecutable::ParallelCpuExecutable(
-    std::unique_ptr<SimpleOrcJIT> jit,
-    std::unique_ptr<const BufferAssignment> assignment,
-    std::unique_ptr<const HloModule> hlo_module,
-    std::unique_ptr<const HloInstructionMap<string>> function_names,
-    std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
-        aligned_constants,
-    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
-                 std::move(hlo_profile_index_map)),
-      jit_(std::move(jit)),
-      assignment_(std::move(assignment)),
-      function_names_(std::move(function_names)),
-      aligned_constants_(std::move(aligned_constants)) {}
-
-// Type of the computation function we expect in the JIT.
-using ComputeFunctionType = void (*)(void*, const void*, const void**, void**,
-                                     int64*, int64*);
-
-// Given a pointer to an output buffer (following the CPU JIT calling
-// conventions), mark addresses that are "live". The initial pointer itself is
-// trivially live. If the shape of the buffer is a tuple, this analysis looks
-// into the tuple's elements and marks them live as well (since tuples keep
-// pointers to buffers) and also works recursively.
-// address is an in-memory buffer address that contains some runtime XLA object.
-// shape is its shape. marked_addresses is the set of live addresses to
-// populate.
-static void MarkLiveAddressesInOutput(
-    const void* address, const Shape& shape,
-    std::unordered_set<const void*>* marked_addresses) {
-  marked_addresses->insert(address);
-  const uintptr_t* address_buffer = static_cast<const uintptr_t*>(address);
-  if (ShapeUtil::IsTuple(shape)) {
-    for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
-      const uintptr_t* element_address = address_buffer + i;
-      const void* element = reinterpret_cast<const void*>(*element_address);
-      MarkLiveAddressesInOutput(
-          element, ShapeUtil::GetTupleElementShape(shape, i), marked_addresses);
-    }
-  }
-}
-
-namespace {
-
-// Executor manages the concurrent execution of 'functions' for instructions
-// in 'pending' on 'thread_pool' (storing resulting data in 'results').
-class Executor {
- public:
-  Executor(const HloInstructionMap<ComputeFunctionType>& functions,
-           const ServiceExecutableRunOptions* run_options,
-           std::list<HloInstruction*>* pending,
-           HloInstructionMap<const void*>* results, void** temps_array,
-           int64* profile_counters_array, const BufferAssignment* assignment)
-      : functions_(functions),
-        run_options_(run_options),
-        pending_(pending),
-        results_(results),
-        temps_array_(temps_array),
-        profile_counters_array_(profile_counters_array),
-        thread_pool_(CHECK_NOTNULL(run_options_->xla_intra_op_thread_pool())),
-        assignment_(assignment) {}
-
-  // Executes pending list of instructions on thread pool.
-  // Returns OK status on success, error status otherwise.
-  Status Run();
-
- private:
-  // Schedules a parallel invocation of compute function for 'instruction' on
-  // 'thread_pool_', storing result in 'result_buffer'.
-  // If 'partition_buffers' is non-null, parallel task will be invoked on
-  // per-dimension partition [start, limit) values stored in
-  // 'partition_buffers'.
-  void Schedule(HloInstruction* instruction, int64* partition_buffers,
-                void* result_buffer);
-
-  // Returns true if 'instruction' has been assigned parallel tasks (returns
-  // false otherwise).
-  bool HasParallelTasks(HloInstruction* instruction);
-
-  // Returns in 'partition_buffers' the partition [size, limit) for each
-  // dimension.
-  int64* GetPartitionBuffers(
-      const std::vector<std::pair<int64, int64>>& partition);
-
-  // Returns array of result buffers for all operands in 'instruction'.
-  const void** GetOperandBuffers(HloInstruction* instruction);
-
-  // Arguments passed into Executor.
-  const HloInstructionMap<ComputeFunctionType>& functions_;
-  const ServiceExecutableRunOptions* run_options_;
-  std::list<HloInstruction*>* pending_;
-  HloInstructionMap<const void*>* results_;
-  void** temps_array_;
-  int64* profile_counters_array_;
-  tensorflow::thread::ThreadPool* thread_pool_;
-  const BufferAssignment* assignment_;
-
-  // Members used to manage instruction execution.
-  tensorflow::mutex completion_queue_lock_;
-  tensorflow::condition_variable completion_queue_cv_;
-  std::deque<HloInstruction*> completion_queue_;
-  int64 instructions_in_flight_ = 0;
-  std::unordered_map<const HloInstruction*, int64> tasks_in_flight_;
-};
-
-Status Executor::Run() {
-  while (!pending_->empty() || instructions_in_flight_ > 0) {
-    auto pending_it = pending_->begin();
-    while (pending_it != pending_->end()) {
-      HloInstruction* instruction = *pending_it;
-      // Skip pending instructions whose operands aren't ready.
-      if (std::any_of(instruction->operands().begin(),
-                      instruction->operands().end(),
-                      [&](HloInstruction* operand) {
-                        return !ContainsKey(*results_, operand);
-                      })) {
-        ++pending_it;
-        continue;
-      }
-
-      // Get 'result_buffer' reference to result buffer for 'instruction'.
-      TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                          assignment_->GetUniqueTopLevelSlice(instruction));
-      void* result_buffer =
-          static_cast<char*>(temps_array_[result_slice.index()]) +
-          result_slice.offset();
-
-      if (HasParallelTasks(instruction)) {
-        // 'instruction' has been assigned parallel task partitions.
-        CHECK_EQ(HloOpcode::kCall, instruction->opcode());
-        HloInstruction* root = instruction->to_apply()->root_instruction();
-
-        // Create ShapePartitionIterator to iterate through all outer dimension
-        // partitions of 'instruction'.
-        ShapePartitionIterator partition_iterator(
-            root->shape(), root->outer_dimension_partitions());
-
-        const int64 partition_count =
-            partition_iterator.GetTotalPartitionCount();
-
-        // Record total parallel task count for 'instruction' before dispatch.
-        {
-          tensorflow::mutex_lock l(completion_queue_lock_);
-          tasks_in_flight_.insert(std::make_pair(instruction, partition_count));
-          VLOG(2) << "Schedule PARALLEL"
-                  << " instruction: " << instruction->name()
-                  << " instruction.callee: "
-                  << instruction->to_apply()->root_instruction()->name()
-                  << " partition_count: " << partition_count;
-        }
-
-        for (int64 i = 0; i < partition_count; ++i) {
-          // Get partition [start, limit) for each dimension.
-          auto partition_buffers =
-              GetPartitionBuffers(partition_iterator.GetPartition(i));
-          Schedule(instruction, partition_buffers, result_buffer);
-        }
-
-      } else {
-        // Set tasks in-flight to '1' for sequential instruction execution.
-        {
-          tensorflow::mutex_lock l(completion_queue_lock_);
-          tasks_in_flight_.insert(std::make_pair(instruction, 1));
-          VLOG(2) << "Schedule SEQUENTIAL"
-                  << " instruction: " << instruction->name()
-                  << " instruction.callee: "
-                  << instruction->to_apply()->root_instruction()->name();
-        }
-        Schedule(instruction, nullptr, result_buffer);
-      }
-
-      ++instructions_in_flight_;
-      pending_it = pending_->erase(pending_it);
-    }
-    // Wait for a completed HLO instruction to be present in the queue.  We will
-    // pop it out of the queue and make the result available to its users.
-    HloInstruction* instruction;
-    do {
-      tensorflow::mutex_lock l(completion_queue_lock_);
-      if (completion_queue_.empty()) {
-        completion_queue_cv_.wait(l);
-      }
-      if (!completion_queue_.empty()) {
-        instruction = completion_queue_.front();
-        completion_queue_.pop_front();
-        break;
-      }
-    } while (true);
-    TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                        assignment_->GetUniqueTopLevelSlice(instruction));
-    void* result_buffer =
-        static_cast<char*>(temps_array_[result_slice.index()]) +
-        result_slice.offset();
-    InsertOrDie(results_, instruction, result_buffer);
-    --instructions_in_flight_;
-  }
-  return Status::OK();
-}
-
-void Executor::Schedule(HloInstruction* instruction, int64* partition_buffers,
-                        void* result_buffer) {
-  // The thread pool entry takes ownership of |operand_buffers|.
-  auto operand_buffers = GetOperandBuffers(instruction);
-
-  auto function = FindOrDie(functions_, instruction);
-  const auto* exec_run_options = &run_options_->run_options();
-  thread_pool_->Schedule([this, instruction, result_buffer, operand_buffers,
-                          partition_buffers, exec_run_options, function]() {
-    function(result_buffer, exec_run_options, operand_buffers, temps_array_,
-             partition_buffers, profile_counters_array_);
-
-    delete[] operand_buffers;
-    delete[] partition_buffers;
-    // Push the completed HLO instruction on the queue, the main
-    // thread will pop it off and potentially launch more work which
-    // uses the result.
-    // TODO(b/27458679) Consider alternative task scheduling and synchronization
-    // schemes. For example, we could avoid the overhead associate with the
-    // condvar here if the thread just dequed the next instruction to execute
-    // on completion.
-    {
-      tensorflow::mutex_lock l(completion_queue_lock_);
-      // Decrement in-flight task count for this completion.
-      if (--FindOrDie(tasks_in_flight_, instruction) == 0) {
-        completion_queue_.push_back(instruction);
-        completion_queue_cv_.notify_all();
-        tasks_in_flight_.erase(instruction);
-      }
-    }
-  });
-}
-
-int64* Executor::GetPartitionBuffers(
-    const std::vector<std::pair<int64, int64>>& partition) {
-  // Return in 'partition_buffers' partition [size, limit) for each dimension.
-  auto partition_buffers = new int64[partition.size() * 2];
-  for (int i = 0; i < partition.size(); ++i) {
-    partition_buffers[2 * i + 0] = partition[i].first;
-    partition_buffers[2 * i + 1] = partition[i].first + partition[i].second;
-  }
-  return partition_buffers;
-}
-
-bool Executor::HasParallelTasks(HloInstruction* instruction) {
-  return instruction->opcode() == HloOpcode::kCall &&
-         !instruction->to_apply()
-              ->root_instruction()
-              ->outer_dimension_partitions()
-              .empty();
-}
-
-const void** Executor::GetOperandBuffers(HloInstruction* instruction) {
-  // We cannot use a move-only RAII type like std::unique_ptr because the
-  // list of operands is allocated on the main thread and transferred to the
-  // worker via the lambda passed to enqueue_function.  In order for the
-  // lambda to take ownership, we would need to use generalized lambda
-  // capture which is a feature new to C++14.
-  // TODO(b/27458679) Avoid dynamic allocations in Executor.
-  auto operand_buffers = new const void*[instruction->operand_count()];
-  std::transform(instruction->operands().begin(), instruction->operands().end(),
-                 operand_buffers, [this](HloInstruction* operand) {
-                   return FindOrDie(*results_, operand);
-                 });
-  return operand_buffers;
-}
-
-}  // namespace
-
-Status ParallelCpuExecutable::AllocateBuffers(
-    DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-    std::vector<se::DeviceMemoryBase>* buffers) {
-  CHECK_EQ(buffers->size(), assignment_->Allocations().size());
-  VLOG(3) << "Allocating " << assignment_->Allocations().size()
-          << " allocations for module " << module().name();
-  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
-       ++i) {
-    auto& allocation = assignment_->GetAllocation(i);
-
-    VLOG(3) << allocation.ToString();
-
-    if (allocation.is_entry_computation_parameter()) {
-      VLOG(3) << "allocation #" << i << " is a parameter";
-      continue;
-    }
-
-    if (allocation.is_thread_local()) {
-      VLOG(3) << "buffer #" << i << " is thread-local";
-      continue;
-    }
-
-    int64 buffer_size = allocation.size();
-    if (!(*buffers)[i].is_null()) {
-      VLOG(3) << "buffer #" << i
-              << " is in the preallocated result ShapedBuffer";
-    } else {
-      TF_ASSIGN_OR_RETURN((*buffers)[i], memory_allocator->Allocate(
-                                             device_ordinal, buffer_size));
-
-      VLOG(3) << "buffer #" << i << " allocated " << buffer_size << " bytes ["
-              << (*buffers)[i].opaque() << "]";
-    }
-
-    // Since the output buffer and all the temporary buffers were written into
-    // by the JITed code, msan has no way of knowing their memory was
-    // initialized. Mark them initialized so that msan doesn't flag loads from
-    // these buffers.
-    TF_ANNOTATE_MEMORY_IS_INITIALIZED((*buffers)[i].opaque(), buffer_size);
-  }
-
-  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                      assignment_->GetUniqueTopLevelOutputSlice());
-  VLOG(3) << "result index: " << result_slice.index();
-
-  return Status::OK();
-}
-
-Status ParallelCpuExecutable::ExecuteComputeFunctions(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
-    HloExecutionProfile* hlo_execution_profile) {
-  // Allocate profiling counters for each hlo instruction that we would like to
-  // profile.
-  std::vector<int64>* profile_counters = nullptr;
-  if (hlo_execution_profile) {
-    profile_counters = hlo_execution_profile->mutable_profile_counters();
-  }
-
-  std::vector<void*> buffer_pointers;
-  buffer_pointers.reserve(buffers.size());
-  for (auto device_allocation : buffers) {
-    buffer_pointers.push_back(device_allocation.opaque());
-  }
-
-  // Resolve functions for all the HLO instructions ahead of time.
-  HloInstructionMap<ComputeFunctionType> functions;
-  for (auto& entry : *function_names_) {
-    tensorflow::mutex_lock lock(jit_mutex_);
-    HloInstruction* instruction = entry.first;
-    llvm::JITSymbol sym = jit_->FindCompiledSymbol(entry.second);
-    TF_RET_CHECK(sym);
-    InsertOrDie(
-        &functions, instruction,
-        reinterpret_cast<ComputeFunctionType>(cantFail(sym.getAddress())));
-  }
-
-  // Map containing pointers to result buffers for each instruction.
-  HloInstructionMap<const void*> results;
-
-  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
-
-  std::list<HloInstruction*> pending;
-
-  // Call the function for each HLO instruction in topological order.
-  const HloComputation& entry_computation = *module().entry_computation();
-  for (auto* instruction : entry_computation.MakeInstructionPostOrder()) {
-    // Parameters and constants have no functions associated with them. Instead
-    // just copy the existing buffer into the map containing instruction
-    // results..
-    if (instruction->opcode() == HloOpcode::kParameter) {
-      InsertOrDie(
-          &results, instruction,
-          arguments[instruction->parameter_number()]->root_buffer().opaque());
-    } else if (instruction->opcode() == HloOpcode::kConstant) {
-      unsigned char* aligned_data =
-          FindOrDie(aligned_constants_, instruction).get();
-      InsertOrDie(&results, instruction, aligned_data);
-    } else {
-      TF_RET_CHECK(instruction->opcode() == HloOpcode::kCall);
-      pending.push_back(instruction);
-    }
-  }
-
-  // TODO(b/27458679) Manage scheduling based on in-flight concurrency limits.
-  // For example, if we expect a library conv/matmul call to run at max
-  // concurrency, we should not dispatch runnable instructions until the
-  // library call is finished (to avoid expensive cache invalidation).
-  Executor executor(
-      functions, run_options, &pending, &results, buffer_pointers.data(),
-      profile_counters ? profile_counters->data() : nullptr, assignment_.get());
-
-  TF_RETURN_IF_ERROR(executor.Run());
-
-  uint64 end_micros = tensorflow::Env::Default()->NowMicros();
-
-  {
-    tensorflow::mutex_lock lock(mutex_);
-    double nanoseconds = (end_micros - start_micros) * 1000.0;
-    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
-  }
-
-  return Status::OK();
-}
-
-StatusOr<ScopedShapedBuffer> ParallelCpuExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  if (GetRootPointsToSet().IsAmbiguous()) {
-    return Unimplemented("Points-to set of root instruction is ambiguous");
-  }
-
-  se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-  std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
-
-  ScopedShapedBuffer result_buffer(
-      /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
-      run_options->allocator(), stream->parent()->device_ordinal());
-
-  TF_RETURN_IF_ERROR(AllocateBuffers(
-      memory_allocator, stream->parent()->device_ordinal(), &buffers));
-
-  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(run_options, arguments, buffers,
-                                             hlo_execution_profile));
-
-  // Copy DeviceMemoryBase values which into the respective location in
-  // the ScopedShapedBuffer which is returned to the caller.
-  std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
-  TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
-      [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
-        const auto& sources = this->GetRootPointsToSet().element(index);
-
-        // The points to set is unambiguous so the set should be a singleton.
-        CHECK_EQ(1, sources.size());
-        const LogicalBuffer* buffer_source = sources[0];
-        HloInstruction* src = buffer_source->instruction();
-
-        // The source for this result buffer can be a nested buffer such as a
-        // tuple element. The source instruction should have a non-parameter
-        // buffer assigned.
-        TF_ASSIGN_OR_RETURN(
-            const BufferAllocation::Slice slice,
-            this->assignment_->GetUniqueSlice(src, buffer_source->index()));
-        CHECK(!slice.allocation()->is_entry_computation_parameter());
-
-        const BufferAllocation::Index buffer_index = slice.index();
-        const se::DeviceMemoryBase& buffer = buffers[buffer_index];
-        CHECK(!buffer.is_null() || buffer.size() == 0);
-        *device_memory = buffer;
-        buffers_in_result[buffer_index] = true;
-        return Status::OK();
-      }));
-
-  // Free all buffers not in the result.
-  for (size_t i = 0; i < buffers.size(); ++i) {
-    se::DeviceMemoryBase alloc = buffers[i];
-    if (!buffers_in_result[i] && !alloc.is_null()) {
-      VLOG(3) << "CpuExecutable deallocating buffer #" << i << " ["
-              << alloc.opaque() << "]";
-      TF_RETURN_IF_ERROR(memory_allocator->Deallocate(
-          stream->parent()->device_ordinal(), &alloc));
-    }
-  }
-
-  return std::move(result_buffer);
-}
-
-StatusOr<ScopedShapedBuffer> ParallelCpuExecutable::ExecuteAsyncOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
-  // TODO(b/30671675): Implement asynchronous execution mode.
-  return Unimplemented(
-      "Asynchronous execution on stream is not yet supported on CPU.");
-}
-
-const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const {
-  return assignment_->points_to_analysis().GetPointsToSet(
-      module().entry_computation()->root_instruction());
-}
-
-}  // namespace cpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
deleted file mode 100644
index 55f8331b59..0000000000
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_
-
-#include <stddef.h>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
-#include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/executable.h"
-#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/shaped_buffer.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-
-namespace xla {
-namespace cpu {
-
-// CPU-targeting parallel implementation of the XLA Executable interface.
-//
-// Wraps a JIT-ed object that can be executed "on device". We JIT for the host
-// architecture, so JIT-ed code and host code share the same ABI.
-class ParallelCpuExecutable : public Executable {
- public:
-  ParallelCpuExecutable(
-      std::unique_ptr<SimpleOrcJIT> jit,
-      std::unique_ptr<const BufferAssignment> assignment,
-      std::unique_ptr<const HloModule> hlo_module,
-      std::unique_ptr<const HloInstructionMap<string>> function_names,
-      std::unordered_map<const HloInstruction*,
-                         std::unique_ptr<unsigned char[]>>
-          aligned_constants,
-      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
-  ~ParallelCpuExecutable() override {}
-
-  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
-  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
-
-  // This should be called after set_ir_module_string.
-  const string& ir_module_string() const { return ir_module_string_; }
-
-  void set_ir_module_string(const string& ir_module_string) {
-    ir_module_string_ = ir_module_string;
-  }
-
-  static int64 ShapeSizeBytes(const Shape& shape) {
-    // On the cpu, opaques are pointers.
-    if (ShapeUtil::IsOpaque(shape)) {
-      return sizeof(void*);
-    }
-    return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
-  }
-
- private:
-  // Allocate buffers required for execution and assign them to the elements of
-  // "buffers". "buffers" should be sized to the number of buffers in buffer
-  // assignment. Each vector element corresponds to a particular Index. If
-  // a vector element already contains a non-null DeviceMemoryBase, then no
-  // buffer is assigned for this element.
-  Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
-                         int device_ordinal,
-                         std::vector<se::DeviceMemoryBase>* buffers);
-
-  // Calls the generated functions in 'function_names_', performing the
-  // computation with the given arguments using the supplied buffers.
-  Status ExecuteComputeFunctions(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
-      HloExecutionProfile* hlo_execution_profile);
-
-  // Returns the points-to set of the root instruction of the entry
-  // computation. Uses points-to analysis from buffer assignment.
-  const PointsToSet& GetRootPointsToSet() const;
-
-  // The JIT containing compiled modules.
-  tensorflow::mutex jit_mutex_;
-  const std::unique_ptr<SimpleOrcJIT> jit_ GUARDED_BY(jit_mutex_);
-
-  // Buffer assignment for the buffers we need to allocate.
-  const std::unique_ptr<const BufferAssignment> assignment_;
-
-  // The LLVM IR, in string format, of the unoptimized module generated for this
-  // ParallelCpuExecutable. We save a string instead of an llvm::Module* because
-  // leaving llvm::Module* in a singleton can cause the heap checker to emit
-  // false positives.
-  string ir_module_string_;
-
-  // Map containing the JITted function names for each HLO instruction.
-  const std::unique_ptr<const HloInstructionMap<string>> function_names_;
-
-  // Map from HLO Constant instructions to a pointer to their literal data.
-  // The data stored in the protocol buffer might be insufficiently aligned,
-  // we create a sufficiently aligned copy and store it in this map.
-  const std::unordered_map<const HloInstruction*,
-                           std::unique_ptr<unsigned char[]>>
-      aligned_constants_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ParallelCpuExecutable);
-};
-
-}  // namespace cpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_
-- 
GitLab


From b75655fc3be17d3b35d28cd649ab0dc14d00009c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 14:59:57 -0700
Subject: [PATCH 433/450] Disable factorization_ops_test on ASAN due to flaky
 timeouts.

PiperOrigin-RevId: 194303178
---
 tensorflow/contrib/factorization/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index 0a648d5d40..f28d95401c 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -215,6 +215,7 @@ tf_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:sparse_tensor",
     ],
+    tags = ["noasan"],  # times out b/78588193
 )
 
 # Estimators tests
-- 
GitLab


From 48913102f90f4945f9bb26cc59e302014ec0d7e2 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Wed, 25 Apr 2018 15:21:16 -0700
Subject: [PATCH 434/450] Refactoring: Remove a redundant map from model.h.

PiperOrigin-RevId: 194306629
---
 tensorflow/contrib/lite/kernels/test_util.h | 4 +++-
 tensorflow/contrib/lite/model.cc            | 6 +-----
 tensorflow/contrib/lite/model.h             | 1 -
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
index a9064d54e7..a5f345e98a 100644
--- a/tensorflow/contrib/lite/kernels/test_util.h
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -88,7 +88,9 @@ struct TensorData {
 class SingleOpResolver : public OpResolver {
  public:
   SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration)
-      : op_(op), registration_(registration) {}
+      : op_(op), registration_(registration) {
+    registration_->builtin_code = op;
+  }
   TfLiteRegistration* FindOp(BuiltinOperator op) const override {
     if (op == op_) {
       return registration_;
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 2dd6d67e07..f45f39d1e6 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -194,7 +194,6 @@ TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() {
           builtin_code);
       status = kTfLiteError;
     } else if (builtin_code != BuiltinOperator_CUSTOM) {
-      flatbuffer_op_index_to_registration_types_.push_back(builtin_code);
       registration = op_resolver_.FindOp(builtin_code);
       if (registration == nullptr) {
         error_reporter_->Report("Didn't find op for builtin opcode '%s'\n",
@@ -208,8 +207,6 @@ TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() {
     } else {
       const char* name = opcode->custom_code()->c_str();
       registration = op_resolver_.FindOp(name);
-      flatbuffer_op_index_to_registration_types_.push_back(
-          BuiltinOperator_CUSTOM);
       if (registration == nullptr) {
         error_reporter_->Report("Didn't find custom op for name '%s'\n", name);
         status = kTfLiteError;
@@ -702,8 +699,7 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
       continue;
     }
 
-    auto op_type =
-        flatbuffer_op_index_to_registration_types_[op->opcode_index()];
+    BuiltinOperator op_type = static_cast<BuiltinOperator>(reg->builtin_code);
     if (op_type != BuiltinOperator_CUSTOM && op->custom_options()) {
       error_reporter_->Report(
           "Found builtin operator %s with custom options.\n",
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
index 5a55b031a8..a7d7f3ea10 100644
--- a/tensorflow/contrib/lite/model.h
+++ b/tensorflow/contrib/lite/model.h
@@ -188,7 +188,6 @@ class InterpreterBuilder {
   ErrorReporter* error_reporter_;
 
   std::vector<TfLiteRegistration*> flatbuffer_op_index_to_registration_;
-  std::vector<BuiltinOperator> flatbuffer_op_index_to_registration_types_;
   const Allocation* allocation_ = nullptr;
 };
 
-- 
GitLab


From eb29d1e4f3b05a8c521b6b2fba58c7e54e2885ef Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 15:25:15 -0700
Subject: [PATCH 435/450] Automated g4 rollback of changelist 194261487

PiperOrigin-RevId: 194307293
---
 tensorflow/core/grappler/op_types.cc          | 16 +++++------
 .../optimizers/arithmetic_optimizer.cc        | 27 ++++++-------------
 .../optimizers/arithmetic_optimizer.h         |  5 +++-
 3 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index fba6c5810d..f595cf6456 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -506,13 +506,10 @@ bool IsUnaryElementWise(const NodeDef& node) {
           "Atan2",
           "Atanh",
           "Ceil",
-          "CheckNumerics",
           "ComplexAbs",
           "Conj",
           "Cos",
           "Cosh",
-          "DebugGradientIdentity",
-          "DeepCopy"
           "Digamma",
           "Elu"
           "Erf",
@@ -520,37 +517,36 @@ bool IsUnaryElementWise(const NodeDef& node) {
           "Exp",
           "Expm1",
           "Floor",
-          "Identity",
           "Inv",
           "Invert",
+          "Isinf",
+          "Isnan",
+          "Isfinite",
           "Lgamma",
           "Log",
           "Log1p",
           "LogicalNot",
           "Neg",
-          "PreventGradient",
-          "Print",
           "Reciprocal",
           "Relu",
           "Relu6",
           "Rint",
           "Round",
-          "Rsqrt",
           "Selu",
+          "Rsqrt",
           "Sigmoid",
           "Sign",
           "Sin",
           "SinH",
-          "Snapshot",
           "Softplus",
           "Softsign",
           "Sqrt",
           "Square",
-          "StopGradient",
           "Tan"
           "Tanh",
       }));
-  return element_wise_ops->find(node.op()) != element_wise_ops->end();
+  return element_wise_ops->count(node.op()) > 0 ||
+         (!IsIdentityN(node) && IsValueAndOrderPreserving(node));
 }
 
 bool HasOpDef(const NodeDef& node) {
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 65b3bad64d..866b993e93 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -303,11 +303,6 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
     }
   }
 
-  bool IsInPreserveSet(const NodeDef& node) const {
-    return ctx().nodes_to_preserve->find(node.name()) !=
-           ctx().nodes_to_preserve->end();
-  }
-
  private:
   // Extended context required for ArithmeticOptimizer.
   const ArithmeticOptimizerContext ctx_ext_;
@@ -480,6 +475,11 @@ class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
     return group.root_node->device() == node.device();
   }
 
+  bool IsInPreserveSet(const NodeDef& node) const {
+    return ctx().nodes_to_preserve->find(node.name()) !=
+           ctx().nodes_to_preserve->end();
+  }
+
   bool IsAlreadyOptimized(const NodeDef& node) const {
     return optimized_nodes_.find(node.name()) != optimized_nodes_.end();
   }
@@ -1346,7 +1346,6 @@ class RemoveNegationStage : public ArithmeticOptimizerStage {
 // Exp(Sin(Concat([x, y, z]))).
 // TODO(rmlarsen): Support casting. We would have to change the type attribute
 // on the concat node.
-// TODO(rmlarsen): Handle Enter/Exit.
 class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
  public:
   explicit HoistCWiseUnaryFromConcatStage(
@@ -1357,9 +1356,7 @@ class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
   ~HoistCWiseUnaryFromConcatStage() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
-    if (!IsConcat(*node) || IsInPreserveSet(*node)) {
-      return false;
-    }
+    if (!IsConcat(*node)) return false;
     const int n = node->attr().at("N").i();
     return n > 1;
   }
@@ -1371,11 +1368,6 @@ class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
     TF_RETURN_IF_ERROR(
         FindCommonUnaryOpPrefix(*concat_node, &prefix_length, &ctrl_inputs));
     if (prefix_length > 0) {
-      LOG(INFO) << "Found prefix of length " << prefix_length << " for node:\n"
-                << concat_node->DebugString();
-      for (auto foo : ctrl_inputs) {
-        LOG(INFO) << "ctrl_input = " << foo;
-      }
       TF_RETURN_IF_ERROR(
           HoistUnaryOpPrefix(prefix_length, &ctrl_inputs, concat_node));
       AddToOptimizationQueue(concat_node);
@@ -1421,7 +1413,6 @@ class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
                                  std::set<string>* ctrl_inputs) const {
     *prefix_length = 0;
     const int n = concat_node.attr().at("N").i();
-    const string& concat_device = concat_node.device();
     // Follow the chains backwards from each concat input as long as all the
     // following conditions hold:
     //   1. The ops in all chains are the same.
@@ -1447,10 +1438,8 @@ class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
       if (!IsUnaryElementWise(*tail0)) break;
       for (int chain = 0; chain < n; ++chain) {
         // TODO(rmlarsen): Allow and hoist outgoing control edges.
-        if (tail[chain]->device() != concat_device ||
-            tail[chain]->op() != tail0->op() ||
-            ctx().node_map->GetOutputs(tail[chain]->name()).size() > 1 ||
-            IsInPreserveSet(*tail[chain])) {
+        if (tail[chain]->op() != tail0->op() ||
+            ctx().node_map->GetOutputs(tail[chain]->name()).size() > 1) {
           stop = true;
           break;
         }
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index c299bd030b..375f13acc1 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -65,13 +65,16 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
     bool remove_negation = true;
-    bool hoist_unary_out_of_concat = true;
+    bool hoist_unary_out_of_concat = false;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
     static ArithmeticOptimizerOptions Default(
         RewriterConfig::Toggle opt_level) {
       ArithmeticOptimizerOptions options;
+      if (opt_level == RewriterConfig::AGGRESSIVE) {
+        options.hoist_unary_out_of_concat = true;
+      }
       return options;
     }
   };
-- 
GitLab


From 7b3f2cf2a8674fa1e8c24dfccd5f0d30ee78d08a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 15:28:22 -0700
Subject: [PATCH 436/450] Docs: clarify SubGraph and Model.buffers comments in schema.fbs

PiperOrigin-RevId: 194307738
---
 tensorflow/contrib/lite/schema/schema.fbs | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 2b62c257d8..a65c2e0c70 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -435,21 +435,25 @@ table Operator {
   custom_options_format:CustomOptionsFormat;
 }
 
-// The root type, defining a model.
+// The root type, defining a subgraph, which typically represents an entire
+// model.
 table SubGraph {
-  // A list of all tensors used in this model.
+  // A list of all tensors used in this subgraph.
   tensors:[Tensor];
 
-  // Indices of the input tensors.
+  // Indices of the tensors that are inputs into this subgraph. Note this is
+  // the list of non-static tensors that feed into the subgraph for inference.
   inputs:[int];
 
-  // Indices of the output tensors.
+  // Indices of the tensors that are outputs out of this subgraph. Note this is
+  // the list of output tensors that are considered the product of the
+  // subgraph's inference.
   outputs:[int];
 
   // All operators, in execution order.
   operators:[Operator];
 
-  // Name of subgraph (used for debugging).
+  // Name of this subgraph (used for debugging).
   name:string;
 }
 
@@ -475,7 +479,10 @@ table Model {
   // A description of the model.
   description:string;
 
-  // Buffers of the model
+  // Buffers of the model.
+  // Note the 0th entry of this array must be an empty buffer (sentinel).
+  // This is a convention so that tensors without a buffer can provide 0 as
+  // their buffer.
   buffers:[Buffer];
 
 }
-- 
GitLab


From 1ab4ea34fca26974afbe078b7b9f8d44a9a58858 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 15:29:12 -0700
Subject: [PATCH 437/450] [XLA] Redesign: migrate tests [m-r].* and vector_ops
 test.

Deleted a map_test case about versioned handles, since such a case no longer exists.

PiperOrigin-RevId: 194307861
---
 tensorflow/compiler/xla/tests/BUILD           |   4 +-
 .../xla/tests/client_library_test_base.cc     |   6 +-
 .../xla/tests/client_library_test_base.h      |   4 +-
 tensorflow/compiler/xla/tests/map_test.cc     |  44 --------
 .../xla/tests/matrix_ops_simple_test.cc       |  18 +--
 .../xla/tests/multidimensional_slice_test.cc  |   6 +-
 tensorflow/compiler/xla/tests/params_test.cc  |  85 +++++++-------
 tensorflow/compiler/xla/tests/pred_test.cc    |  49 ++++-----
 tensorflow/compiler/xla/tests/prng_test.cc    |  31 +++---
 .../xla/tests/query_inferred_shape_test.cc    |   8 +-
 tensorflow/compiler/xla/tests/reduce_test.cc  | 104 +++++++++---------
 tensorflow/compiler/xla/tests/replay_test.cc  |  38 +++----
 .../compiler/xla/tests/reshape_motion_test.cc |   5 +-
 tensorflow/compiler/xla/tests/reverse_test.cc |   4 +-
 .../xla/tests/vector_ops_reduce_test.cc       |   8 +-
 .../xla/tests/vector_ops_simple_test.cc       |  69 ++++++------
 16 files changed, 212 insertions(+), 271 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index ca8b3f9ffc..aec926d44b 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1694,13 +1694,11 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
-        "//tensorflow/compiler/xla/service:session_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 541de5ae3b..22660c35dc 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -466,7 +466,7 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
 }
 
 void ClientLibraryTestBase::ComputeAndCompareR1U8(
-    ComputationBuilder* builder, tensorflow::StringPiece expected,
+    XlaBuilder* builder, tensorflow::StringPiece expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   auto actual_status = ExecuteAndTransfer(builder, arguments);
   EXPECT_IS_OK(actual_status.status());
@@ -640,8 +640,8 @@ XlaComputation ClientLibraryTestBase::CreateScalarRelu() {
   return computation_status.ConsumeValueOrDie();
 }
 
-Computation ClientLibraryTestBase::CreateScalarMax() {
-  ComputationBuilder builder(client_, "max");
+XlaComputation ClientLibraryTestBase::CreateScalarMax() {
+  XlaBuilder builder("max");
   auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
   auto x = builder.Parameter(0, shape, "x");
   auto y = builder.Parameter(1, shape, "y");
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index c303a4562e..32eea7c2f3 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -222,7 +222,7 @@ class ClientLibraryTestBase : public ::testing::Test {
   // Compare the result of the computation to a strings. In XLA strings are
   // represented using rank-1 U8 shapes.
   void ComputeAndCompareR1U8(
-      ComputationBuilder* builder, tensorflow::StringPiece expected,
+      XlaBuilder* builder, tensorflow::StringPiece expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
   // Convenience method for running a built computation, transferring the
@@ -256,7 +256,7 @@ class ClientLibraryTestBase : public ::testing::Test {
 
   // Create scalar operations for use in reductions.
   XlaComputation CreateScalarRelu();
-  Computation CreateScalarMax();
+  XlaComputation CreateScalarMax();
   Computation CreateScalarReluSensitivity();
 
   // Special case convenience functions for creating filled arrays.
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 8fabcaca1b..7df45bebeb 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -16,8 +16,6 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -341,48 +339,6 @@ XLA_TEST_F(MapTest, ComplexNestedMaps) {
   ComputeAndCompareR0<float>(&builder, 73.0, {}, ErrorSpec(0.01f));
 }
 
-TEST_F(MapTest, VersionedEmbeddedComputation) {
-  // Build a computation X, use it in a map, then add an additional operation to
-  // computation X and use it again in a different map. Verify that the proper
-  // versions of computation X are used in each of the maps.
-
-  // Create a (embedded) computation which adds one to its parameter argument.
-  ComputationBuilder embedded_builder(client_, "EmbeddedComputation");
-  auto param_0 =
-      embedded_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0");
-  auto constant_one = embedded_builder.ConstantR0<float>(1.0);
-  auto adder_to_one = embedded_builder.Add(param_0, constant_one);
-  auto computation_status = embedded_builder.Build();
-  ASSERT_IS_OK(computation_status.status());
-  auto embedded_computation = computation_status.ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto constant_vector = builder.ConstantR1<float>({1.0, 2.0, 3.0, 4.0});
-  auto map_plus_1 = builder.Map({constant_vector}, embedded_computation, {0});
-
-  // Add another Add(1) operation to the existing embedded computation. This
-  // requires using the stub interface because the ComputationBuilder does not
-  // allow modification to the XlaComputation objects after they have been
-  // built.
-  BinaryOpRequest request;
-  request.set_binop(BINOP_ADD);
-  *request.mutable_lhs() = adder_to_one;
-  *request.mutable_rhs() = constant_one;
-  OpRequest op_request;
-  *op_request.mutable_computation() = embedded_computation.handle();
-  *op_request.mutable_binary_op_request() = request;
-  OpResponse response;
-  tensorflow::Status s = client_->stub()->Op(&op_request, &response);
-  ASSERT_TRUE(s.ok());
-
-  auto map_plus_2 = builder.Map({map_plus_1}, embedded_computation, {0});
-
-  // The original vector has Add(1) applied to it with a map, followed by
-  // Add(1+1) resulting in a net Add(3).
-  ComputeAndCompareR1<float>(&builder, {4.0, 5.0, 6.0, 7.0}, {},
-                             ErrorSpec(0.01f));
-}
-
 TEST_F(MapTest, MapBinaryAdder) {
   // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors.
   XlaBuilder builder(TestName());
diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
index c42f71388b..7fa61eb33c 100644
--- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
@@ -19,8 +19,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -60,7 +61,7 @@ TYPED_TEST_CASE(MatOpsSimpleTest_F16F32, TypesF16F32);
 
 XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, ExpTwoByTwoValues) {
   using T = TypeParam;
-  ComputationBuilder builder(this->client_, "exp_2x2");
+  XlaBuilder builder("exp_2x2");
   auto data = builder.ConstantR2FromArray2D<T>({
       {1.0f, 0.0f},   // row 0
       {-1.0f, 0.5f},  // row 1
@@ -77,10 +78,10 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, ExpTwoByTwoValues) {
 
 XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) {
   using T = TypeParam;
-  Computation add_half;
+  XlaComputation add_half;
   {
     // add_half(x) = x + 0.5
-    ComputationBuilder builder(this->client_, "add_half");
+    XlaBuilder builder("add_half");
     auto x_value =
         builder.Parameter(0, ShapeUtil::MakeShapeWithType<T>({}), "x_value");
     auto half = builder.ConstantR0<T>(static_cast<T>(0.5));
@@ -90,7 +91,7 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) {
     add_half = computation_status.ConsumeValueOrDie();
   }
 
-  ComputationBuilder builder(this->client_, "map_2x2");
+  XlaBuilder builder("map_2x2");
   auto data = builder.ConstantR2FromArray2D<T>({
       {1.0f, 0.0f},   // row 0
       {-1.0f, 0.5f},  // row 1
@@ -106,7 +107,7 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) {
 
 XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MaxTwoByTwoValues) {
   using T = TypeParam;
-  ComputationBuilder builder(this->client_, "max_2x2");
+  XlaBuilder builder("max_2x2");
   auto lhs = builder.ConstantR2FromArray2D<T>({
       {7.0f, 2.0f},   // row 0
       {3.0f, -4.0f},  // row 1
@@ -143,8 +144,7 @@ class TestLinspaceMaxParametric
         MakeLinspaceArray2D<T>(from, to, rows, cols);
     auto arhs = MakeUnique<Array2D<T>>(rows, cols, static_cast<T>(1.0f));
 
-    ComputationBuilder builder(
-        client_,
+    XlaBuilder builder(
         tensorflow::strings::Printf("max_%lldx%lld_linspace", rows, cols));
     auto lhs = builder.ConstantR2FromArray2D<T>(*alhs);
     auto rhs = builder.ConstantR2FromArray2D<T>(*arhs);
@@ -219,7 +219,7 @@ class MatOpsDotAddTest
         client_->TransferToServer(*Literal::CreateR2FromArray2DWithLayout<T>(
             rhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto lhs_arg = builder.Parameter(0, lhs_shape, "lhs");
     auto lhs_mat_arg = lhs_arg;
     if (transpose) {
diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
index 11c0bf7a5a..0791a71aac 100644
--- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
+++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -32,7 +32,7 @@ namespace {
 class SliceTest : public ClientLibraryTestBase {};
 
 XLA_TEST_F(SliceTest, Slice2D) {
-  ComputationBuilder builder(client_, "slice_2d");
+  XlaBuilder builder("slice_2d");
   auto original = builder.ConstantR2<float>(
       {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}});
   builder.Slice(original, {2, 1}, {4, 3}, {1, 1});
@@ -42,7 +42,7 @@ XLA_TEST_F(SliceTest, Slice2D) {
 }
 
 XLA_TEST_F(SliceTest, Slice3D) {
-  ComputationBuilder builder(client_, "slice_3d");
+  XlaBuilder builder("slice_3d");
   Array3D<float> array_3d(
       {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}});
   auto original = builder.ConstantR3FromArray3D<float>(array_3d);
diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc
index bb7e800df8..97dab860c0 100644
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@@ -20,9 +20,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -41,7 +42,7 @@ namespace {
 class ParamsTest : public ClientLibraryTestBase {};
 
 XLA_TEST_F(ParamsTest, ConstantR0F32Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(3.14159f);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
@@ -53,7 +54,7 @@ XLA_TEST_F(ParamsTest, ConstantR0F32Param) {
 }
 
 XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
@@ -65,7 +66,7 @@ XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) {
 }
 
 XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({3.14f, -100.25f});
   std::unique_ptr<GlobalData> param0_data =
@@ -78,7 +79,7 @@ XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) {
 }
 
 XLA_TEST_F(ParamsTest, ConstantR1U8Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   string str("hello world");
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1U8(str);
   std::unique_ptr<GlobalData> param0_data =
@@ -91,7 +92,7 @@ XLA_TEST_F(ParamsTest, ConstantR1U8Param) {
 }
 
 XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR2FromArray2D<float>(Array2D<float>(3, 0));
   std::unique_ptr<GlobalData> param0_data =
@@ -104,7 +105,7 @@ XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) {
 }
 
 XLA_TEST_F(ParamsTest, ConstantR2F32Param) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR2<float>(
       {{3.14f, -100.25f}, {7e8f, 7e-9f}, {30.3f, -100.0f}});
   std::unique_ptr<GlobalData> param0_data =
@@ -119,7 +120,7 @@ XLA_TEST_F(ParamsTest, ConstantR2F32Param) {
 }
 
 XLA_TEST_F(ParamsTest, TwoParameters) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>({1, 2});
   std::unique_ptr<GlobalData> param0_data =
@@ -156,19 +157,15 @@ XLA_TEST_F(ParamsTest, MissingParameter) {
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto p = builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "param2");
-  auto computation = builder.Build().ConsumeValueOrDie();
+  auto computation_status = builder.Build();
 
-  auto execute_status = client_->Execute(computation, {data.get(), data.get()},
-                                         /*execution_options=*/nullptr,
-                                         /*execution_profile=*/nullptr);
-  ASSERT_EQ(execute_status.status().code(),
-            tensorflow::error::FAILED_PRECONDITION);
+  ASSERT_NE(computation_status.status(), tensorflow::Status::OK());
 }
 
 XLA_TEST_F(ParamsTest, UnusedParameter) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>({1, 2});
   std::unique_ptr<GlobalData> param0_data =
@@ -188,7 +185,7 @@ XLA_TEST_F(ParamsTest, UnusedParameter) {
 XLA_TEST_F(ParamsTest, UnusedParametersInUnusedExpression) {
   // Build a computation with a couple unused parameters which are used in an
   // unused expression.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>({1, 2});
   std::unique_ptr<GlobalData> param0_data =
@@ -214,12 +211,12 @@ XLA_TEST_F(ParamsTest, UnusedParametersInUnusedExpression) {
 }
 
 XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   constexpr int size = 8 * 128 * 2;
 
   std::vector<float> init_value = {{0, 1}};
   init_value.resize(size);
-  ComputationDataHandle sum_handle = builder.ConstantR1<float>(init_value);
+  XlaOp sum_handle = builder.ConstantR1<float>(init_value);
   std::vector<float> sum = {{0, 1}};
   sum.resize(size);
 
@@ -237,8 +234,7 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) {
     std::unique_ptr<Literal> literal = Literal::CreateR1<float>(sum_value);
     param_data_owner.push_back(
         client_->TransferToServer(*literal).ConsumeValueOrDie());
-    ComputationDataHandle param =
-        builder.Parameter(i, literal->shape(), "param");
+    XlaOp param = builder.Parameter(i, literal->shape(), "param");
     sum_handle = builder.Add(sum_handle, param);
   }
 
@@ -262,10 +258,10 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) {
 // compilation.
 XLA_TEST_F(ParamsTest,
            DISABLED_ON_CPU(DISABLED_ON_GPU(ThreeThousandParameters))) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<std::unique_ptr<GlobalData>> param_data_owner;
-  ComputationDataHandle sum_handle = builder.ConstantR0<float>(0.0f);
+  XlaOp sum_handle = builder.ConstantR0<float>(0.0f);
   float target = 0.0;
   constexpr int kParamCount = 3000;
   for (int i = 0; i < kParamCount; ++i) {
@@ -273,8 +269,7 @@ XLA_TEST_F(ParamsTest,
     std::unique_ptr<Literal> literal = Literal::CreateR0<float>(i);
     param_data_owner.push_back(
         std::move(client_->TransferToServer(*literal)).ValueOrDie());
-    ComputationDataHandle param =
-        builder.Parameter(i, literal->shape(), "param");
+    XlaOp param = builder.Parameter(i, literal->shape(), "param");
     sum_handle = builder.Add(sum_handle, param);
   }
 
@@ -294,25 +289,24 @@ XLA_TEST_F(ParamsTest,
 // compilation.
 XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(
                            ThreeThousandParametersAndOutputElements))) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<std::unique_ptr<GlobalData>> param_data_owner;
-  ComputationDataHandle sum_handle = builder.ConstantR1<int32>({0, 0});
+  XlaOp sum_handle = builder.ConstantR1<int32>({0, 0});
   int32 target = 0;
   constexpr int kParamCount = 3000;
-  std::vector<ComputationDataHandle> params;
+  std::vector<XlaOp> params;
   for (int i = 0; i < kParamCount; ++i) {
     target += i;
     std::unique_ptr<Literal> literal = Literal::CreateR1<int32>({i, i});
     param_data_owner.push_back(
         std::move(client_->TransferToServer(*literal)).ValueOrDie());
-    ComputationDataHandle param =
-        builder.Parameter(i, literal->shape(), "param");
+    XlaOp param = builder.Parameter(i, literal->shape(), "param");
     params.push_back(param);
     sum_handle = builder.Add(sum_handle, param);
   }
 
-  std::vector<ComputationDataHandle> outputs;
+  std::vector<XlaOp> outputs;
   for (int i = 0; i < kParamCount; ++i) {
     outputs.push_back(builder.Add(params[i], sum_handle));
   }
@@ -353,18 +347,17 @@ XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(
 // 2017-12-12.
 XLA_TEST_F(ParamsTest,
            DISABLED_ON_CPU(DISABLED_ON_GPU(ManyParametersIntoWhileLoop))) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<std::unique_ptr<GlobalData>> param_data_owner;
   constexpr int kParamCount = 1900;
-  std::vector<ComputationDataHandle> params;
+  std::vector<XlaOp> params;
   std::vector<Shape> parameter_shapes;
   for (int i = 0; i < kParamCount; ++i) {
     std::unique_ptr<Literal> literal = Literal::CreateR1<int32>({i, i});
     param_data_owner.push_back(
         std::move(client_->TransferToServer(*literal)).ValueOrDie());
-    ComputationDataHandle param =
-        builder.Parameter(i, literal->shape(), "param");
+    XlaOp param = builder.Parameter(i, literal->shape(), "param");
     params.push_back(param);
     parameter_shapes.push_back(literal->shape());
   }
@@ -374,7 +367,7 @@ XLA_TEST_F(ParamsTest,
   std::unique_ptr<Literal> bool_literal = Literal::CreateR0<bool>(false);
   param_data_owner.push_back(
       std::move(client_->TransferToServer(*bool_literal)).ValueOrDie());
-  ComputationDataHandle bool_param =
+  XlaOp bool_param =
       builder.Parameter(kParamCount, bool_literal->shape(), "bool_param");
   params.push_back(bool_param);
   parameter_shapes.push_back(bool_literal->shape());
@@ -383,9 +376,9 @@ XLA_TEST_F(ParamsTest,
 
   // Create a computation for the condition: while(bool_param).
   Shape while_shape = ShapeUtil::MakeTupleShape(parameter_shapes);
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto condition_parameter =
         builder.Parameter(0, while_shape, "condition_parameter");
     builder.GetTupleElement(condition_parameter, kParamCount);
@@ -394,11 +387,11 @@ XLA_TEST_F(ParamsTest,
 
   // Create a computation for the body.
   // Add {1, 1} to the each tuple element.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto body_parameter = builder.Parameter(0, while_shape, "body_parameter");
-    std::vector<ComputationDataHandle> updates;
+    std::vector<XlaOp> updates;
     for (int i = 0; i < kParamCount; ++i) {
       auto add = builder.Add(builder.GetTupleElement(body_parameter, i),
                              builder.ConstantR1<int32>({1, 1}));
@@ -413,7 +406,7 @@ XLA_TEST_F(ParamsTest,
 
   auto loop = builder.While(condition, body, init);
 
-  std::vector<ComputationDataHandle> outputs;
+  std::vector<XlaOp> outputs;
   for (int i = 0; i < kParamCount; ++i) {
     outputs.push_back(builder.GetTupleElement(loop, i));
   }
@@ -437,7 +430,7 @@ XLA_TEST_F(ParamsTest,
 #endif
 
 XLA_TEST_F(ParamsTest, TupleOfR1ParametersAddedTogether) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Shape r1f32_3 = ShapeUtil::MakeShape(F32, {3});
   Shape tuple_shape = ShapeUtil::MakeTupleShape({r1f32_3, r1f32_3});
@@ -464,7 +457,7 @@ XLA_TEST_F(ParamsTest, TupleOfR1ParametersAddedTogether) {
 XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) {
   std::unique_ptr<Literal> literal = Literal::CreateR2WithLayout<float>(
       {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({0, 1}));
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Parameter(0, literal->shape(), "input");
 
   std::unique_ptr<GlobalData> data =
@@ -476,7 +469,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) {
 XLA_TEST_F(ParamsTest, R2_2x2_Layout_10) {
   std::unique_ptr<Literal> literal = Literal::CreateR2WithLayout<float>(
       {{1, 3}, {2, 4}}, LayoutUtil::MakeLayout({1, 0}));
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Parameter(0, literal->shape(), "input");
 
   std::unique_ptr<GlobalData> data =
@@ -501,7 +494,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) {
     ASSERT_EQ(2, literal->Get<float>({0, 1}));
   }
   // Use the original shape in building the computation.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.Parameter(0, original, "input");
   // Use the slice operator to get an off-diagonal element.
   builder.Slice(input, {0, 1}, {1, 2}, {1, 1});
diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc
index 10e44b274a..77159efb26 100644
--- a/tensorflow/compiler/xla/tests/pred_test.cc
+++ b/tensorflow/compiler/xla/tests/pred_test.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -29,63 +29,62 @@ namespace {
 
 class PredTest : public ClientLibraryTestBase {
  protected:
-  void TestCompare(bool lhs, bool rhs, bool expected,
-                   ComputationDataHandle (ComputationBuilder::*op)(
-                       const ComputationDataHandle&,
-                       const ComputationDataHandle&,
-                       tensorflow::gtl::ArraySlice<int64>)) {
-    ComputationBuilder builder(client_, TestName());
-    ComputationDataHandle lhs_op = builder.ConstantR0<bool>(lhs);
-    ComputationDataHandle rhs_op = builder.ConstantR0<bool>(rhs);
-    ComputationDataHandle result = (builder.*op)(lhs_op, rhs_op, {});
+  void TestCompare(
+      bool lhs, bool rhs, bool expected,
+      XlaOp (XlaBuilder::*op)(const xla::XlaOp&, const xla::XlaOp&,
+                              tensorflow::gtl::ArraySlice<int64>)) {
+    XlaBuilder builder(TestName());
+    XlaOp lhs_op = builder.ConstantR0<bool>(lhs);
+    XlaOp rhs_op = builder.ConstantR0<bool>(rhs);
+    XlaOp result = (builder.*op)(lhs_op, rhs_op, {});
     ComputeAndCompareR0<bool>(&builder, expected, {});
   }
 };
 
 TEST_F(PredTest, ConstantR0PredTrue) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR0<bool>(true);
   ComputeAndCompareR0<bool>(&builder, true, {});
 }
 
 TEST_F(PredTest, ConstantR0PredFalse) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR0<bool>(false);
   ComputeAndCompareR0<bool>(&builder, false, {});
 }
 
 TEST_F(PredTest, ConstantR0PredCompareEq) {
-  TestCompare(true, false, false, &ComputationBuilder::Eq);
+  TestCompare(true, false, false, &XlaBuilder::Eq);
 }
 
 TEST_F(PredTest, ConstantR0PredCompareNe) {
-  TestCompare(true, false, true, &ComputationBuilder::Ne);
+  TestCompare(true, false, true, &XlaBuilder::Ne);
 }
 
 TEST_F(PredTest, ConstantR0PredCompareLe) {
-  TestCompare(true, false, false, &ComputationBuilder::Le);
+  TestCompare(true, false, false, &XlaBuilder::Le);
 }
 
 TEST_F(PredTest, ConstantR0PredCompareLt) {
-  TestCompare(true, false, false, &ComputationBuilder::Lt);
+  TestCompare(true, false, false, &XlaBuilder::Lt);
 }
 
 TEST_F(PredTest, ConstantR0PredCompareGe) {
-  TestCompare(true, false, true, &ComputationBuilder::Ge);
+  TestCompare(true, false, true, &XlaBuilder::Ge);
 }
 
 TEST_F(PredTest, ConstantR0PredCompareGt) {
-  TestCompare(true, false, true, &ComputationBuilder::Gt);
+  TestCompare(true, false, true, &XlaBuilder::Gt);
 }
 
 TEST_F(PredTest, ConstantR1Pred) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({true, false, false, true});
   ComputeAndCompareR1<bool>(&builder, {true, false, false, true}, {});
 }
 
 TEST_F(PredTest, ConstantR2Pred) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR2<bool>({{false, true, true}, {true, false, false}});
   const string expected = R"(pred[2,3] {
@@ -96,28 +95,28 @@ TEST_F(PredTest, ConstantR2Pred) {
 }
 
 TEST_F(PredTest, AnyR1True) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({true, false});
   TF_ASSERT_OK(Any(a, &builder).status());
   ComputeAndCompareR0<bool>(&builder, true, {});
 }
 
 TEST_F(PredTest, AnyR1False) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({false, false});
   TF_ASSERT_OK(Any(a, &builder).status());
   ComputeAndCompareR0<bool>(&builder, false, {});
 }
 
 TEST_F(PredTest, AnyR1VacuouslyFalse) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<bool>({});
   TF_ASSERT_OK(Any(a, &builder).status());
   ComputeAndCompareR0<bool>(&builder, false, {});
 }
 
 TEST_F(PredTest, AnyR2True) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<bool>({
       {false, false, false},
       {false, false, false},
@@ -128,7 +127,7 @@ TEST_F(PredTest, AnyR2True) {
 }
 
 TEST_F(PredTest, AnyR2False) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<bool>({
       {false, false, false},
       {false, false, false},
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 733d89fdcc..29a4f75001 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <limits>
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -52,13 +52,14 @@ class PrngTest : public ClientLibraryTestBase {
 template <typename T>
 std::unique_ptr<Literal> PrngTest::UniformTest(
     T a, T b, tensorflow::gtl::ArraySlice<int64> dims, int64 seed) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.RngUniform(
       builder.ConstantR0<T>(a), builder.ConstantR0<T>(b),
       ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<T>(), dims));
 
   SetSeed(seed);
-  auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
+  auto actual =
+      ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie();
   EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
   actual->EachCell<T>([=](tensorflow::gtl::ArraySlice<int64>, T value) {
     EXPECT_LE(a, value);
@@ -139,13 +140,14 @@ double PrngTest::UniformChiSquared(int32 range_size, int32 expected_count,
                                    int64 seed) {
   int32 sample_size = range_size * expected_count;
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.RngUniform(builder.ConstantR0<int32>(0),
                      builder.ConstantR0<int32>(range_size),
                      ShapeUtil::MakeShape(S32, {sample_size}));
 
   SetSeed(seed);
-  auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
+  auto actual =
+      ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie();
   std::vector<int32> counts(range_size, 0);
   actual->EachCell<int32>([&counts](tensorflow::gtl::ArraySlice<int64>,
                                     int32 value) { ++counts[value]; });
@@ -180,16 +182,15 @@ XLA_TEST_F(PrngTest, Uniformity256) {
 
 XLA_TEST_F(PrngTest, MapUsingRng) {
   // Build a x -> (x + U[0,1)) computation.
-  auto build_sum_rng = [this](ComputationBuilder& builder) {
+  auto build_sum_rng = [this](XlaBuilder& builder) {
     auto b = builder.CreateSubBuilder("sum_with_rng");
     auto x = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "input");
-    b->Add(x,
-           b->RngUniform(b->ConstantR0<float>(0), b->ConstantR0<float>(1),
-                         ShapeUtil::MakeShape(F32, {})));
+    b->Add(x, b->RngUniform(b->ConstantR0<float>(0), b->ConstantR0<float>(1),
+                            ShapeUtil::MakeShape(F32, {})));
     return b->BuildAndNoteError();
   };
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 5.3f, 4.4f, 5.5f});
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> param0_data,
@@ -224,7 +225,7 @@ XLA_TEST_F(PrngTest, MapUsingRng) {
 XLA_TEST_F(PrngTest, PassInGlobalRngSeed) {
   // Build a U[0,1) computation.
   auto build_computation = [this]() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     builder.RngUniform(builder.ConstantR0<float>(0),
                        builder.ConstantR0<float>(1),
                        ShapeUtil::MakeShape(F32, {10}));
@@ -280,24 +281,24 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) {
 }
 
 XLA_TEST_F(PrngTest, TenValuesN01) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.RngNormal(builder.ConstantR0<float>(0), builder.ConstantR0<float>(1),
                     ShapeUtil::MakeShape(F32, {10}));
 
   SetSeed(42);
-  ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
+  ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie();
   // TODO(b/25995601): Test that resultant values are reasonable
 }
 
 XLA_TEST_F(PrngTest, RngUniformCrash) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // This used to crash XLA during LLVM IR generation for CPUs.
   auto rng_uniform = builder.RngUniform(builder.ConstantR0<int32>(0),
                                         builder.ConstantR0<int32>(1000 * 1000),
                                         ShapeUtil::MakeShape(S32, {}));
   SetSeed(0);
-  ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
+  ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie();
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
index 212512207c..f95e756483 100644
--- a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
+++ b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -30,13 +30,13 @@ namespace {
 class QueryInferredShapeTest : public ClientLibraryTestBase {};
 
 TEST_F(QueryInferredShapeTest, OnePlusOneShape) {
-  ComputationBuilder builder(client_, "one_plus_one");
+  XlaBuilder builder("one_plus_one");
   auto one = builder.ConstantR0<float>(1.0);
   auto result = builder.Add(one, one);
-  StatusOr<std::unique_ptr<Shape>> shape_status = builder.GetShape(result);
+  StatusOr<Shape> shape_status = builder.GetShape(result);
   ASSERT_IS_OK(shape_status.status());
   auto shape = shape_status.ConsumeValueOrDie();
-  ASSERT_TRUE(ShapeUtil::Equal(*shape, ShapeUtil::MakeShape(F32, {})));
+  ASSERT_TRUE(ShapeUtil::Equal(shape, ShapeUtil::MakeShape(F32, {})));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 423ccadb5b..bcc05c2d41 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -60,10 +59,9 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using FuncGeneratorForType = Computation (*)(PrimitiveType,
-                                             ComputationBuilder*);
+using FuncGeneratorForType = XlaComputation (*)(PrimitiveType, XlaBuilder*);
 
-using FuncGenerator = Computation (*)(ComputationBuilder*);
+using FuncGenerator = XlaComputation (*)(XlaBuilder*);
 
 class ReduceTest : public ClientLibraryTestBase {
  protected:
@@ -89,8 +87,8 @@ class ReduceTest : public ClientLibraryTestBase {
 
   // Runs an R1 => R0 reduction test with the given number of elements.
   void RunR1ToR0Test(int64 element_count) {
-    ComputationBuilder builder(client_, TestName());
-    Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+    XlaBuilder builder(TestName());
+    XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
     const Shape input_shape = ShapeUtil::MakeShape(F32, {element_count});
     auto input = builder.Parameter(0, input_shape, "input");
     auto zero = builder.ConstantR0<float>(0.0);
@@ -119,13 +117,13 @@ class ReduceTest : public ClientLibraryTestBase {
   void RunR1ToR0PredTest(bool and_reduce,
                          tensorflow::gtl::ArraySlice<int> input_data) {
     const int element_count = input_data.size();
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     const Shape input_shape = ShapeUtil::MakeShape(S32, {element_count});
     auto input_par = builder.Parameter(0, input_shape, "input");
     auto pred_values =
         builder.Eq(input_par, builder.ConstantR1<int>(element_count, 1));
-    ComputationDataHandle init_value;
-    Computation reduce;
+    XlaOp init_value;
+    XlaComputation reduce;
     if (and_reduce) {
       init_value = builder.ConstantR0<bool>(true);
       reduce = CreateScalarAndComputation(&builder);
@@ -157,13 +155,13 @@ class ReduceTest : public ClientLibraryTestBase {
   template <int64 cols>
   void RunR2ToR1PredTest(bool and_reduce, int64 rows, int64 minor = 1,
                          int64 major = 0) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     const Shape input_shape = ShapeUtil::MakeShape(U8, {rows, cols});
     auto input = builder.Parameter(0, input_shape, "input");
     auto input_pred = builder.Eq(input, builder.ConstantR0<uint8>(1));
 
-    ComputationDataHandle init_value;
-    Computation reduce_op;
+    XlaOp init_value;
+    XlaComputation reduce_op;
     if (and_reduce) {
       init_value = builder.ConstantR0<bool>(true);
       reduce_op = CreateScalarAndComputation(&builder);
@@ -202,8 +200,8 @@ class ReduceTest : public ClientLibraryTestBase {
 
   // Runs an R2 => R0 reduction test with the given number of (rows, cols).
   void RunR2ToR0Test(int64 rows, int64 cols, int64 minor = 1, int64 major = 0) {
-    ComputationBuilder builder(client_, TestName());
-    Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+    XlaBuilder builder(TestName());
+    XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
     const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols});
     auto input = builder.Parameter(0, input_shape, "input");
     auto zero = builder.ConstantR0<float>(0.0);
@@ -230,8 +228,8 @@ class ReduceTest : public ClientLibraryTestBase {
 
   // Runs an R2 => R1 reduction test with the given number of (rows, cols).
   void RunR2ToR1Test(int64 rows, int64 cols, int64 minor = 1, int64 major = 0) {
-    ComputationBuilder builder(client_, TestName());
-    Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+    XlaBuilder builder(TestName());
+    XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
     const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols});
     auto input = builder.Parameter(0, input_shape, "input");
     auto zero = builder.ConstantR0<float>(0.0);
@@ -261,7 +259,7 @@ class ReduceTest : public ClientLibraryTestBase {
   template <typename NativeT>
   void ComputeAndCompareGeneric(
       typename std::enable_if<std::is_floating_point<NativeT>::value,
-                              ComputationBuilder>::type* builder,
+                              XlaBuilder>::type* builder,
       tensorflow::gtl::ArraySlice<NativeT> expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
     ComputeAndCompareR1<NativeT>(builder, expected, arguments,
@@ -271,7 +269,7 @@ class ReduceTest : public ClientLibraryTestBase {
   template <typename NativeT>
   void ComputeAndCompareGeneric(
       typename std::enable_if<std::is_integral<NativeT>::value,
-                              ComputationBuilder>::type* builder,
+                              XlaBuilder>::type* builder,
       tensorflow::gtl::ArraySlice<NativeT> expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
     ComputeAndCompareR1<NativeT>(builder, expected, arguments);
@@ -279,15 +277,15 @@ class ReduceTest : public ClientLibraryTestBase {
 
   template <typename NativeT>
   void RunVectorizedReduceTestForType(
-      const std::function<Computation(ComputationBuilder*)>&
+      const std::function<XlaComputation(XlaBuilder*)>&
           reduction_function_generator,
       const std::function<NativeT(NativeT, NativeT)>&
           reference_reduction_function,
       const NativeT& initial_value) {
     const int rows = 64, cols = 128;
     const int minor = 1, major = 0;
-    ComputationBuilder builder(client_, TestName());
-    Computation reduction_function = reduction_function_generator(&builder);
+    XlaBuilder builder(TestName());
+    XlaComputation reduction_function = reduction_function_generator(&builder);
     const Shape input_shape = ShapeUtil::MakeShape(
         xla::primitive_util::NativeToPrimitiveType<NativeT>(), {rows, cols});
     auto input = builder.Parameter(0, input_shape, "input");
@@ -322,7 +320,7 @@ class ReduceTest : public ClientLibraryTestBase {
   }
 
   void RunVectorizedReduceTest(
-      const std::function<Computation(PrimitiveType, ComputationBuilder*)>&
+      const std::function<XlaComputation(PrimitiveType, XlaBuilder*)>&
           reduction_function_generator_for_type,
       const std::function<float(float, float)>&
           reference_reduction_function_for_floats,
@@ -334,21 +332,21 @@ class ReduceTest : public ClientLibraryTestBase {
       uint32 unsigned_int_identity) {
     // Float version
     RunVectorizedReduceTestForType<float>(
-        [&](ComputationBuilder* builder) {
+        [&](XlaBuilder* builder) {
           return reduction_function_generator_for_type(F32, builder);
         },
         reference_reduction_function_for_floats, floating_point_identity);
 
     // Signed int version
     RunVectorizedReduceTestForType<int32>(
-        [&](ComputationBuilder* builder) {
+        [&](XlaBuilder* builder) {
           return reduction_function_generator_for_type(S32, builder);
         },
         reference_reduction_function_for_ints, signed_int_identity);
 
     // Unsigned int version
     RunVectorizedReduceTestForType<uint32>(
-        [&](ComputationBuilder* builder) {
+        [&](XlaBuilder* builder) {
           return reduction_function_generator_for_type(U32, builder);
         },
         reference_reduction_function_for_uints, unsigned_int_identity);
@@ -442,8 +440,8 @@ XLA_TEST_F(ReduceTest, OrReduceOnesAndZerosR1_10_Pred) {
 XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) {
   const int64 rows = 111, cols = 50;
 
-  ComputationBuilder builder(client_, TestName());
-  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
   const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols});
   auto input = builder.Parameter(0, input_shape, "input");
   auto zero = builder.ConstantR0<float>(0.0);
@@ -473,8 +471,8 @@ XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) {
 XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
   const int64 rows = 111, cols = 50;
 
-  ComputationBuilder builder(client_, TestName());
-  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
   const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols});
   auto input = builder.Parameter(0, input_shape, "input");
   auto zero = builder.ConstantR0<float>(0.0);
@@ -522,8 +520,8 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) {
 XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) {
   const int64 rows = 111, cols = 50;
 
-  ComputationBuilder builder(client_, TestName());
-  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
   const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, 2, cols / 2});
   auto input = builder.Parameter(0, input_shape, "input");
   auto zero = builder.ConstantR0<float>(0.0);
@@ -569,7 +567,7 @@ void PrintTo(const BoundsLayout& spec, std::ostream* os) {
 
 // Add-reduces a broadcasted scalar matrix among dimension 1 and 0.
 XLA_TEST_F(ReduceTest, AddReduce2DScalarToR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto add = CreateScalarAddComputation(F32, &builder);
   auto scalar = builder.ConstantR0<float>(42.0);
   auto broadcasted = builder.Broadcast(scalar, {500, 500});
@@ -581,7 +579,7 @@ XLA_TEST_F(ReduceTest, AddReduce2DScalarToR0) {
 
 // Max-reduces a broadcasted scalar matrix among dimension 1 and 0.
 XLA_TEST_F(ReduceTest, MaxReduce2DScalarToR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto max = CreateScalarMaxComputation(F32, &builder);
   auto scalar = builder.ConstantR0<float>(42.0);
   auto broadcasted = builder.Broadcast(scalar, {500, 500});
@@ -593,7 +591,7 @@ XLA_TEST_F(ReduceTest, MaxReduce2DScalarToR0) {
 
 // Max-reduces a matrix among dimension 1 and 0.
 XLA_TEST_F(ReduceTest, MaxReduce2DToR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto max = CreateScalarMaxComputation(F32, &builder);
   Array2D<float> input(300, 250);
   input.FillRandom(214.0f);
@@ -608,7 +606,7 @@ XLA_TEST_F(ReduceTest, MaxReduce2DToR0) {
 
 // Min-reduces matrix among dimension 1 and 0.
 XLA_TEST_F(ReduceTest, MinReduce2DToR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto min = CreateScalarMinComputation(F32, &builder);
   Array2D<float> input(150, 130);
   input.FillRandom(214.0f);
@@ -623,7 +621,7 @@ XLA_TEST_F(ReduceTest, MinReduce2DToR0) {
 }
 
 XLA_TEST_F(ReduceTest, UnsignedInt_MinReduce) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<uint32> input({{1}, {2}});
   auto min = CreateScalarMinComputation(U32, &builder);
   auto input_literal = Literal::CreateR2FromArray2D(input);
@@ -636,7 +634,7 @@ XLA_TEST_F(ReduceTest, UnsignedInt_MinReduce) {
 }
 
 XLA_TEST_F(ReduceTest, UnsignedInt_MaxReduce) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<uint32> input({{1}, {2}});
   auto max = CreateScalarMaxComputation(U32, &builder);
   auto input_literal = Literal::CreateR2FromArray2D(input);
@@ -650,7 +648,7 @@ XLA_TEST_F(ReduceTest, UnsignedInt_MaxReduce) {
 
 // Reduces a matrix among dimension 1.
 XLA_TEST_F(ReduceTest, Reduce2DAmong1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_2d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {1});
@@ -661,7 +659,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmong1) {
 
 XLA_TEST_F(ReduceTest, Reduce2DAmong0and1) {
   // Reduce a matrix among dimensions 0 and 1 (sum it up to a scalar).
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_2d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {0, 1});
@@ -671,7 +669,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmong0and1) {
 
 // Tests 2D matrix ReduceToRow operation.
 XLA_TEST_F(ReduceTest, Reduce2DAmongY) {
-  ComputationBuilder builder(client_, "reduce_among_y");
+  XlaBuilder builder("reduce_among_y");
   auto m = builder.ConstantLiteral(*literal_2d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {0});
@@ -681,7 +679,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmongY) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3AmongDims_1_2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {1, 2});
@@ -691,7 +689,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDims_1_2) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3AmongDims_0_1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {0, 1});
@@ -701,7 +699,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDims_0_1) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3ToR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {0, 1, 2});
@@ -711,7 +709,7 @@ XLA_TEST_F(ReduceTest, ReduceR3ToR0) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3AmongDim0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {0});
@@ -726,7 +724,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim0) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3AmongDim1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {1});
@@ -743,7 +741,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim1) {
 }
 
 XLA_TEST_F(ReduceTest, ReduceR3AmongDim2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto m = builder.ConstantLiteral(*literal_3d_);
   auto add = CreateScalarAddComputation(F32, &builder);
   builder.Reduce(m, builder.ConstantR0<float>(0.0f), add, {2});
@@ -817,7 +815,7 @@ class ReduceR3ToR2Test : public ReduceTest,
                          public ::testing::WithParamInterface<BoundsLayout> {};
 
 XLA_TEST_P(ReduceR3ToR2Test, ReduceR3ToR2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const auto& bounds = GetParam().bounds;
   Array3D<float> input_array(bounds[0], bounds[1], bounds[2]);
   //  input_array.FillRandom(3.14f, 0.05);
@@ -831,7 +829,7 @@ XLA_TEST_P(ReduceR3ToR2Test, ReduceR3ToR2) {
 
   auto input_activations =
       builder.Parameter(0, input_literal->shape(), "input");
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   auto sum = builder.Reduce(input_activations, builder.ConstantR0<float>(0.0f),
                             add, GetParam().reduce_dims);
 
@@ -871,8 +869,8 @@ INSTANTIATE_TEST_CASE_P(
 // IrEmitterUnnested::EmitInitializer() for the Reduce operator.  Failed on
 // 2017-07-26.
 XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OperationOnConstantAsInitValue)) {
-  ComputationBuilder builder(client_, TestName());
-  Computation max_f32 = CreateScalarMaxComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation max_f32 = CreateScalarMaxComputation(F32, &builder);
 
   auto a = builder.ConstantR0<float>(2.0f);
   auto a2 = builder.Abs(a);
@@ -899,8 +897,8 @@ class ReduceInitializerTest : public ReduceTest {
  protected:
   template <typename T>
   void DoTest(T initializer, int num_elems) {
-    ComputationBuilder builder(client_, TestName());
-    Computation max_fn = CreateScalarMaxComputation(
+    XlaBuilder builder(TestName());
+    XlaComputation max_fn = CreateScalarMaxComputation(
         primitive_util::NativeToPrimitiveType<T>(), &builder);
 
     auto init = builder.ConstantR0<T>(initializer);
@@ -940,7 +938,7 @@ XLA_TEST_F(ReduceInitializerTest, U64InitializerBigValue) {
 // returns one of the parameters). In this case, we return the rhs, which for
 // a 1D array with one element, should not be the init value.
 XLA_TEST_F(ReduceTest, ReduceIdentity) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Shape single_float = ShapeUtil::MakeShape(F32, {});
   builder.Parameter(0, single_float, "lhs-unused");
   builder.Parameter(1, single_float, "rhs-used");
diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc
index 6d063ffc36..36d763b0f7 100644
--- a/tensorflow/compiler/xla/tests/replay_test.cc
+++ b/tensorflow/compiler/xla/tests/replay_test.cc
@@ -15,13 +15,13 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -38,17 +38,17 @@ class ReplayTest : public ClientLibraryTestBase {};
 
 TEST_F(ReplayTest, TwoPlusTwoReplay) {
   // Make 2+2 computation.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto two = builder.ConstantR0<int32>(2);
   builder.Add(two, two);
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
 
   // Serialize it out.
-  std::unique_ptr<SessionModule> module =
+  std::unique_ptr<HloSnapshot> module =
       computation.Snapshot().ConsumeValueOrDie();
 
   // Replay it.
-  Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
+  XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
 
   // Check signature is the same.
   std::unique_ptr<ProgramShape> original_shape =
@@ -69,18 +69,18 @@ TEST_F(ReplayTest, TwoPlusTwoReplay) {
 
 XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) {
   // Make computation.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(S32, {}), "y");
   builder.Add(x, y);
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
 
   // Serialize it out.
-  std::unique_ptr<SessionModule> module =
+  std::unique_ptr<HloSnapshot> module =
       computation.Snapshot().ConsumeValueOrDie();
 
   // Replay it.
-  Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
+  XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
 
   // Check signature is the same.
   std::unique_ptr<ProgramShape> original_shape =
@@ -109,24 +109,24 @@ XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) {
 
 TEST_F(ReplayTest, MapPlusTwoOverR1) {
   // As above, but with map(+2) over some constant array.
-  ComputationBuilder plus_two_builder(client_, "plus two");
+  XlaBuilder plus_two_builder("plus two");
   auto input =
       plus_two_builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "input");
   plus_two_builder.Add(input, plus_two_builder.ConstantR0<int32>(2));
-  Computation plus_two = plus_two_builder.Build().ConsumeValueOrDie();
+  XlaComputation plus_two = plus_two_builder.Build().ConsumeValueOrDie();
 
-  ComputationBuilder mapper_builder(client_, TestName());
+  XlaBuilder mapper_builder(TestName());
   auto original = mapper_builder.ConstantR1<int32>({1, 2, 3});
   mapper_builder.Map({original}, plus_two, {0});
 
-  Computation computation = mapper_builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = mapper_builder.Build().ConsumeValueOrDie();
 
   // Serialize it out.
-  std::unique_ptr<SessionModule> module =
+  std::unique_ptr<HloSnapshot> module =
       computation.Snapshot().ConsumeValueOrDie();
 
   // Replay it.
-  Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
+  XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie();
 
   // Check signature is the same.
   std::unique_ptr<ProgramShape> original_shape =
@@ -135,10 +135,6 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) {
       client_->GetComputationShape(replayed).ConsumeValueOrDie();
   ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape));
 
-  // Destroy the originals.
-  computation.Reset();
-  plus_two.Reset();
-
   // Run it.
   std::unique_ptr<Literal> literal =
       client_
diff --git a/tensorflow/compiler/xla/tests/reshape_motion_test.cc b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
index e045e164e2..5ebd526899 100644
--- a/tensorflow/compiler/xla/tests/reshape_motion_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
@@ -20,10 +20,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -45,7 +44,7 @@ namespace {
 using ReshapeMotionTest = ClientLibraryTestBase;
 
 TEST_F(ReshapeMotionTest, ElementwiseOfReshapesWithNonSameInputShapes) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2<int32>({{2, 3, 5}, {7, 11, 13}});
   auto b = builder.ConstantR2<int32>({{17, 19}, {23, 29}, {31, 37}});
   auto c = builder.Reshape(a, {6});
diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc
index 6959c95502..e7bd142dc9 100644
--- a/tensorflow/compiler/xla/tests/reverse_test.cc
+++ b/tensorflow/compiler/xla/tests/reverse_test.cc
@@ -114,7 +114,7 @@ class ReverseTest : public ClientLibraryTestBase {};
 
 // Tests the reverse operation on a 4D U8 array on dimension 0 and 3.
 XLA_TEST_F(ReverseTest, Reverse4DU8ArrayOnDim23) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Input shape is U8[1x2x3x4].
   // clang-format off
   Array4D<uint8> input({{
@@ -144,7 +144,7 @@ XLA_TEST_F(ReverseTest, Reverse4DU8ArrayOnDim23) {
 
 // Tests the reverse operation on a 4D float array on dimension 0 and 1.
 TEST_F(ReverseTest, Reverse4DFloatArrayOnDim01) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Input shape is float[4x3x2x1].
   // clang-format off
   Array4D<float> input({
diff --git a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
index 32ba067a10..82d301983f 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
@@ -19,9 +19,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -33,9 +33,9 @@ namespace {
 
 class VecOpsReduceTest : public ClientLibraryTestBase {
  public:
-  VecOpsReduceTest() : builder_(client_, TestName()) {}
+  VecOpsReduceTest() : builder_(TestName()) {}
 
-  ComputationDataHandle BuildSampleConstantCube() {
+  XlaOp BuildSampleConstantCube() {
     // clang-format off
     Array3D<float> x3d({
           {{1.0, 2.0, 3.0},   // | dim 1    // } plane 0 in dim 0
@@ -49,7 +49,7 @@ class VecOpsReduceTest : public ClientLibraryTestBase {
     return builder_.ConstantR3FromArray3D<float>(x3d);
   }
 
-  ComputationBuilder builder_;
+  XlaBuilder builder_;
   ErrorSpec errspec_{1e-3, 0};
 };
 
diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
index 8b86b5e760..3dded3f715 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@@ -19,10 +19,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -49,7 +50,7 @@ class VecOpsSimpleTest : public ClientLibraryTestBase {
 };
 
 XLA_TEST_F(VecOpsSimpleTest, ExpTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   auto exp = builder.Exp(x);
@@ -63,7 +64,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpTenValues) {
 
 XLA_TEST_F(VecOpsSimpleTest, ExpManyValues) {
   for (int count : {63, 64, 65, 127, 128, 129, 17 * 4096}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     std::vector<float> exponents;
     exponents.reserve(count);
     for (int i = 0; i < count; ++i) {
@@ -84,7 +85,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpManyValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, ExpIn4D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> exponents(2, 2, 2, 2);
 
   std::vector<float> exponents_vector;
@@ -106,7 +107,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpIn4D) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, NegateTenFloatValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   builder.Neg(x);
@@ -117,7 +118,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateTenFloatValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, NegateTenInt32Values) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>({2, -2, 12, -4, 5, 20, -15, 0, -2, 1});
   builder.Neg(x);
 
@@ -126,7 +127,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateTenInt32Values) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, NegateUint32Values) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<uint32>(
       {0, 1, 42, static_cast<uint32>(-1), static_cast<uint32>(-12)});
   builder.Neg(x);
@@ -136,7 +137,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateUint32Values) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, SquareTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   builder.SquareF32(x);
@@ -147,7 +148,7 @@ XLA_TEST_F(VecOpsSimpleTest, SquareTenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, ReciprocalTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   builder.ReciprocalF32(x);
@@ -159,7 +160,7 @@ XLA_TEST_F(VecOpsSimpleTest, ReciprocalTenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, SqrtZeroes) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>({0.0, -0.0});
   auto exp = builder.SqrtF32(x);
 
@@ -167,7 +168,7 @@ XLA_TEST_F(VecOpsSimpleTest, SqrtZeroes) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, SqrtSixValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>({16.0, 1.0, 1024.0, 0.16, 0.2, 12345});
   auto exp = builder.SqrtF32(x);
 
@@ -176,7 +177,7 @@ XLA_TEST_F(VecOpsSimpleTest, SqrtSixValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, InvSqrtSevenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x =
       builder.ConstantR1<float>({16.0, 1.0, 1024.0, 0.16, 0.2, 12345, 1.2345});
   auto exp = builder.Pow(x, builder.ConstantR0<float>(-.5f));
@@ -188,7 +189,7 @@ XLA_TEST_F(VecOpsSimpleTest, InvSqrtSevenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto add = CreateScalarAddComputation(F32, &builder);
 
   auto x = builder.ConstantR1<float>(
@@ -203,7 +204,7 @@ XLA_TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, MaxTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   auto y = builder.ConstantR1<float>(
@@ -218,8 +219,8 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValues) {
 XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) {
   // Similar to MaxTenValues, except that the inputs come from params rather
   // than constants.
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle v1, v2;
+  XlaBuilder builder(TestName());
+  XlaOp v1, v2;
   std::unique_ptr<GlobalData> param0_data = CreateR1Parameter<float>(
       {41.0f, 2.0f, 3.0f, 84.0f}, /*parameter_number=*/0, /*name=*/"v1",
       /*builder=*/&builder, /*data_handle=*/&v1);
@@ -236,7 +237,7 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) {
 XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) {
   // Similar to MaxTenValuesFromParams, except that the data size passed in and
   // out is large.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // Number of floats in the data passed into and out of the computation.
   constexpr int datalen = 15 * 1000;
@@ -259,7 +260,7 @@ XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) {
     expected_vec.push_back(larger);
   }
 
-  ComputationDataHandle v1, v2;
+  XlaOp v1, v2;
   std::unique_ptr<GlobalData> param0_data =
       CreateR1Parameter<float>(v1vec, /*parameter_number=*/0, /*name=*/"v1",
                                /*builder=*/&builder, /*data_handle=*/&v1);
@@ -274,7 +275,7 @@ XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   auto y = builder.ConstantR0<float>(0);
@@ -286,7 +287,7 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, MinTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
   auto y = builder.ConstantR1<float>(
@@ -299,7 +300,7 @@ XLA_TEST_F(VecOpsSimpleTest, MinTenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, MinMaxTenValues) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR0<float>(0);
   auto one = builder.ConstantR0<float>(1);
   auto x = builder.ConstantR1<float>(
@@ -312,7 +313,7 @@ XLA_TEST_F(VecOpsSimpleTest, MinMaxTenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR0<float>(0);
   auto one = builder.ConstantR0<float>(1);
   auto x = builder.ConstantR1<float>(
@@ -325,7 +326,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR1<float>({0.0f, 0.0f});
   auto one = builder.ConstantR1<float>({1.0f, 1.0f});
   auto x = builder.ConstantR1<float>({2.1, -2.6});
@@ -336,7 +337,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto one = builder.ConstantR0<float>(1);
   auto two = builder.ConstantR0<float>(2);
   auto x = builder.ConstantR1<float>(
@@ -360,10 +361,10 @@ XLA_TEST_F(VecOpsSimpleTest, ClampValuesConstantS64) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
-  Computation add_half;
+  XlaComputation add_half;
   {
     // add_half(x) = x + 0.5
-    ComputationBuilder builder(client_, "add_half");
+    XlaBuilder builder("add_half");
     auto x_value =
         builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x_value");
     auto half = builder.ConstantR0<float>(0.5);
@@ -373,10 +374,10 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
     add_half = computation_status.ConsumeValueOrDie();
   }
 
-  Computation clamp;
+  XlaComputation clamp;
   {
     // clamp(y) = clamp<0,5>(y)
-    ComputationBuilder builder(client_, "clamp");
+    XlaBuilder builder("clamp");
     auto y_value =
         builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "y_value");
     auto zero = builder.ConstantR0<float>(0.0);
@@ -386,10 +387,10 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
     clamp = computation_status.ConsumeValueOrDie();
   }
 
-  Computation mult_relu_add;
+  XlaComputation mult_relu_add;
   {
     // mult_relu_add(z) = clamp(add_half(2 * max(z, 0)))
-    ComputationBuilder builder(client_, "mult_relu_add");
+    XlaBuilder builder("mult_relu_add");
     auto z_value =
         builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "z_value");
     auto zero = builder.ConstantR0<float>(0.0);
@@ -403,7 +404,7 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
     mult_relu_add = computation_status.ConsumeValueOrDie();
   }
 
-  ComputationBuilder builder(client_, "map10");
+  XlaBuilder builder("map10");
   {
     auto x = builder.ConstantR1<float>(
         {2.1, -21.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
@@ -416,7 +417,7 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, RemainderTenValuesS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<int32>({-5, -4, -3, -2, -1, 0, 1, 2, 3, 4});
   auto y = builder.ConstantR0<int32>(3);
   builder.Rem(x, y);
@@ -426,7 +427,7 @@ XLA_TEST_F(VecOpsSimpleTest, RemainderTenValuesS32) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, VectorPredicateEqual) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<bool>({false, true});
   auto y = builder.ConstantR1<bool>({true, false});
   builder.Eq(x, y);
@@ -436,7 +437,7 @@ XLA_TEST_F(VecOpsSimpleTest, VectorPredicateEqual) {
 }
 
 XLA_TEST_F(VecOpsSimpleTest, VectorPredicateNotEqual) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<bool>({false, true});
   auto y = builder.ConstantR1<bool>({true, false});
   builder.Ne(x, y);
-- 
GitLab


From eb31cf8a62739d4df4c84b8edeccbe756b70616d Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Wed, 25 Apr 2018 16:20:22 -0700
Subject: [PATCH 438/450] Checkpointable: better handling of objects which
 aren't being restored

initialize_or_restore on a tf.train.Checkpoint status object will now initialize
any variables which aren't being restored, which is closer to the behavior when
executing eagerly (and makes it easier to use).

Fixes a bug where assert_consumed() would miss some Python objects which aren't
part of the object graph being restored. It will now (correctly/as documented)
complain about unmatched Python objects in the dependency graph.

PiperOrigin-RevId: 194315742
---
 .../optimizer_v2/checkpointable_utils_test.py |  6 +-
 .../python/training/checkpointable_utils.py   | 85 +++++++++++++++----
 .../training/checkpointable_utils_test.py     | 85 +++++++++++++++++--
 3 files changed, 149 insertions(+), 27 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 8ac9b58145..9e2858d00f 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -702,8 +702,7 @@ class CheckpointCompatibilityTests(test.TestCase):
       with save_graph.as_default(), self.test_session(
           graph=save_graph) as session:
         root = self._initialized_model()
-        object_saver = checkpointable_utils.CheckpointableSaver(root)
-        save_path = object_saver.save(
+        save_path = root.save(
             session=session, file_prefix=checkpoint_prefix)
     with context.eager_mode():
       root = self._initialized_model()
@@ -716,8 +715,7 @@ class CheckpointCompatibilityTests(test.TestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     with context.eager_mode():
       root = self._initialized_model()
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
-      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+      save_path = root.save(file_prefix=checkpoint_prefix)
     with context.graph_mode():
       save_graph = ops.Graph()
       with save_graph.as_default(), self.test_session(
diff --git a/tensorflow/python/training/checkpointable_utils.py b/tensorflow/python/training/checkpointable_utils.py
index 2a97b50fe7..9cdd53cbf9 100644
--- a/tensorflow/python/training/checkpointable_utils.py
+++ b/tensorflow/python/training/checkpointable_utils.py
@@ -341,19 +341,19 @@ def _serialize_object_graph(root_checkpointable):
       slot_variables=slot_variables)
 
 
-def gather_initializers(root_checkpointable):
-  """Traverse the object graph and find initialization ops.
+def list_objects(root_checkpointable):
+  """Traverse the object graph and list all accessible objects.
 
   Looks for `Checkpointable` objects which are dependencies of
-  `root_checkpointable` and which have an `initializer` property. Includes
-  initializers for slot variables only if the variable they are slotting for and
-  the optimizer are dependencies of `root_checkpointable` (i.e. if they would be
-  saved with a checkpoint).
+  `root_checkpointable`. Includes slot variables only if the variable they are
+  slotting for and the optimizer are dependencies of `root_checkpointable`
+  (i.e. if they would be saved with a checkpoint).
 
   Args:
-    root_checkpointable: A `Checkpointable` object to gather initializers for.
+    root_checkpointable: A `Checkpointable` object whose dependencies should be
+      flattened.
   Returns:
-    A list of initialization ops.
+    A flat list of objects.
   """
   # TODO(allenl): Extract out gathering logic so the naming logic doesn't have
   # to run.
@@ -368,6 +368,24 @@ def gather_initializers(root_checkpointable):
       checkpointable_objects=checkpointable_objects,
       node_ids=node_ids,
       object_names=object_names)
+  return checkpointable_objects
+
+
+def gather_initializers(root_checkpointable):
+  """Traverse the object graph and find initialization ops.
+
+  Looks for `Checkpointable` objects which are dependencies of
+  `root_checkpointable` and which have an `initializer` property. Includes
+  initializers for slot variables only if the variable they are slotting for and
+  the optimizer are dependencies of `root_checkpointable` (i.e. if they would be
+  saved with a checkpoint).
+
+  Args:
+    root_checkpointable: A `Checkpointable` object to gather initializers for.
+  Returns:
+    A list of initialization ops.
+  """
+  checkpointable_objects = list_objects(root_checkpointable)
   return [c.initializer for c in checkpointable_objects
           if hasattr(c, "initializer") and c.initializer is not None]
 
@@ -419,9 +437,10 @@ class CheckpointLoadStatus(_LoadStatus):
   See `Saver.restore` for usage examples.
   """
 
-  def __init__(self, checkpoint, feed_dict):
+  def __init__(self, checkpoint, feed_dict, root_checkpointable):
     self._checkpoint = checkpoint
     self._feed_dict = feed_dict
+    self._root_checkpointable = root_checkpointable
 
   def assert_consumed(self):
     """Asserts that all objects in the checkpoint have been created/matched.
@@ -451,6 +470,8 @@ class CheckpointLoadStatus(_LoadStatus):
           ("Unused attributes in these objects (the attributes exist in the "
            "checkpoint but not in the objects): %s") % (
                self._checkpoint.unused_attributes.items(),))
+    for checkpointable_object in list_objects(self._root_checkpointable):
+      self._checkpoint.all_python_objects.add(checkpointable_object)
     unused_python_objects = (
         set(self._checkpoint.all_python_objects)
         - set(self._checkpoint.object_by_proto_id.values()))
@@ -470,17 +491,35 @@ class CheckpointLoadStatus(_LoadStatus):
     session.run(self._checkpoint.restore_ops, feed_dict=self._feed_dict)
 
   def initialize_or_restore(self, session=None):
-    """Alias for `run_restore_ops`.
+    """Run operations to initialize or restore objects in the dependency graph.
+
+    Any objects in the dependency graph which have initializers but are not in
+    the checkpoint will have those initializers run, unless those variables are
+    being restored by a later call to `tf.train.Checkpoint.restore()`.
 
     This method has a sibling in `InitializationOnlyStatus` which instead
     initializes variables. That type is returned if no checkpoint is specified
     in `Saver.restore`.
 
     Args:
-      session: The session to run restore ops in. If `None`, uses the default
-        session.
+      session: The session to run init/restore ops in. If `None`, uses the
+        default session.
     """
+    if context.executing_eagerly():
+      return  # Initialization and restoration ops are run eagerly
+    if session is None:
+      session = ops.get_default_session()
+    all_objects = list_objects(self._root_checkpointable)
+    already_initialized_objects = set(
+        self._checkpoint.object_by_proto_id.values())
+    initializers_for_non_restored_variables = [
+        c.initializer for c in all_objects
+        if getattr(c, "initializer", None) is not None
+        and c not in already_initialized_objects
+        and (getattr(c, "_update_uid", self._checkpoint.restore_uid - 1)
+             < self._checkpoint.restore_uid)]
     self.run_restore_ops(session=session)
+    session.run(initializers_for_non_restored_variables)
 
 
 class InitializationOnlyStatus(_LoadStatus):
@@ -493,7 +532,8 @@ class InitializationOnlyStatus(_LoadStatus):
   otherwise.
   """
 
-  def __init__(self, root_checkpointable):
+  def __init__(self, root_checkpointable, restore_uid):
+    self._restore_uid = restore_uid
     self._root_checkpointable = root_checkpointable
 
   def assert_consumed(self):
@@ -517,8 +557,9 @@ class InitializationOnlyStatus(_LoadStatus):
   def initialize_or_restore(self, session=None):
     """Runs initialization ops for variables.
 
-    Only objects which would be saved by `Saver.save` will be initialized. See
-    `gather_initializers` for details.
+    Objects which would be saved by `Saver.save` will be initialized, unless
+    those variables are being restored by a later call to
+    `tf.train.Checkpoint.restore()`.
 
     This method does nothing when executing eagerly (initializers get run
     eagerly).
@@ -531,7 +572,13 @@ class InitializationOnlyStatus(_LoadStatus):
       return  # run eagerly
     if session is None:
       session = ops.get_default_session()
-    session.run(gather_initializers(self._root_checkpointable))
+    checkpointable_objects = list_objects(self._root_checkpointable)
+    initializers = [
+        c.initializer for c in checkpointable_objects
+        if hasattr(c, "initializer") and c.initializer is not None
+        and (getattr(c, "_update_uid", self._restore_uid - 1)
+             < self._restore_uid)]
+    session.run(initializers)
 
 
 _DEPRECATED_RESTORE_INSTRUCTIONS = (
@@ -787,7 +834,7 @@ class CheckpointableSaver(object):
       object is returned which runs restore ops from a name-based saver.
     """
     if save_path is None:
-      return InitializationOnlyStatus(self._root_checkpointable)
+      return InitializationOnlyStatus(self._root_checkpointable, ops.uid())
     in_graph_mode = not context.executing_eagerly()
     if in_graph_mode:
       if self._file_prefix_placeholder is None:
@@ -834,7 +881,9 @@ class CheckpointableSaver(object):
     checkpointable_lib._CheckpointPosition(  # pylint: disable=protected-access
         checkpoint=checkpoint, proto_id=0).restore(self._root_checkpointable)
     load_status = CheckpointLoadStatus(
-        checkpoint, feed_dict=file_prefix_feed_dict)
+        checkpoint,
+        root_checkpointable=self._root_checkpointable,
+        feed_dict=file_prefix_feed_dict)
     return load_status
 
 
diff --git a/tensorflow/python/training/checkpointable_utils_test.py b/tensorflow/python/training/checkpointable_utils_test.py
index 58e4b3cea5..40dfeb28d5 100644
--- a/tensorflow/python/training/checkpointable_utils_test.py
+++ b/tensorflow/python/training/checkpointable_utils_test.py
@@ -1117,6 +1117,84 @@ class CheckpointingTests(test.TestCase):
     self.assertAllEqual([1., 2., 3., 4., 5.],
                         self.evaluate(deferred_second_dense.bias))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_initialize_if_not_restoring(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    optimizer_only_prefix = os.path.join(checkpoint_directory, "opt")
+    with ops.Graph().as_default(), self.test_session(
+        graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          model=model,  # Do not save the optimizer with the checkpoint.
+          global_step=training_util.get_or_create_global_step())
+      optimizer_checkpoint = checkpointable_utils.Checkpoint(
+          optimizer=optimizer)
+
+      checkpoint_path = saver_lib.latest_checkpoint(checkpoint_directory)
+      status = root.restore(save_path=checkpoint_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      status.initialize_or_restore()
+      self.evaluate([v.initializer for v in optimizer.variables()])
+      train_fn()
+      model_save_path = root.save(file_prefix=checkpoint_prefix)
+      self.evaluate(optimizer.variables()[0].assign(42.))
+      optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix)
+
+    # Restore into a graph with the optimizer
+    with ops.Graph().as_default(), self.test_session(
+        graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          global_step=training_util.get_or_create_global_step())
+      status = root.restore(save_path=model_save_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      status.initialize_or_restore()
+      train_fn()
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
+
+    # Make sure initialization doesn't clobber later restores
+    with ops.Graph().as_default(), self.test_session(
+        graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001, beta1=1.0)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          global_step=training_util.get_or_create_global_step())
+      opt_root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer)
+      status = root.restore(save_path=model_save_path)
+      init_only_optimizer_status = opt_root.restore(save_path=None)
+      optimizer_status = opt_root.restore(save_path=optimizer_save_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      optimizer_status.run_restore_ops()
+      status.initialize_or_restore()
+      init_only_optimizer_status.initialize_or_restore()
+      train_fn()
+      self.assertEqual(42., self.evaluate(optimizer.variables()[0]))
+
 
 class TemplateTests(test.TestCase):
 
@@ -1279,9 +1357,7 @@ class CheckpointCompatibilityTests(test.TestCase):
       with save_graph.as_default(), self.test_session(
           graph=save_graph) as session:
         root = self._initialized_model()
-        object_saver = checkpointable_utils.CheckpointableSaver(root)
-        save_path = object_saver.save(
-            session=session, file_prefix=checkpoint_prefix)
+        save_path = root.save(session=session, file_prefix=checkpoint_prefix)
     with context.eager_mode():
       root = self._initialized_model()
       self._set_sentinels(root)
@@ -1293,8 +1369,7 @@ class CheckpointCompatibilityTests(test.TestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     with context.eager_mode():
       root = self._initialized_model()
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
-      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+      save_path = root.save(file_prefix=checkpoint_prefix)
     with context.graph_mode():
       save_graph = ops.Graph()
       with save_graph.as_default(), self.test_session(
-- 
GitLab


From a88ef07ef2fbae521bd7e914bfda1428a87be934 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Wed, 25 Apr 2018 16:22:29 -0700
Subject: [PATCH 439/450] Various small cleanups and simplifications

PiperOrigin-RevId: 194316037
---
 tensorflow/core/grappler/costs/graph_properties.cc          | 6 ++----
 tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc | 3 +--
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index e3c6c40306..313f63149d 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -158,7 +158,7 @@ struct Processor<DimensionHandle> {
 template <typename Handle>
 class DisjointSet {
  public:
-  DisjointSet(const Processor<Handle>& processor) : processor_(processor) {}
+  DisjointSet() {}
   ~DisjointSet() {
     for (auto rep : nodes_) {
       delete rep.second;
@@ -840,7 +840,7 @@ class SymbolicShapeRefiner {
 // dims, and consolidate the information globally.
 class SymbolicShapeManager {
  public:
-  SymbolicShapeManager() : shapes_(shape_processor_), dims_(dim_processor_) {}
+  SymbolicShapeManager() {}
 
   Status Merge(ShapeHandle s1, ShapeHandle s2) {
     if (!s1.IsSet() || !s2.IsSet()) {
@@ -880,9 +880,7 @@ class SymbolicShapeManager {
   }
 
  private:
-  Processor<ShapeHandle> shape_processor_;
   DisjointSet<shape_inference::ShapeHandle> shapes_;
-  Processor<DimensionHandle> dim_processor_;
   DisjointSet<shape_inference::DimensionHandle> dims_;
 };
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 866b993e93..c0bd0bda95 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -208,8 +208,7 @@ bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
       graph_properties.GetOutputProperties(reshape.name());
   const std::vector<OpInfo::TensorProperties>& input_props =
       graph_properties.GetOutputProperties(input.name());
-  if (reshape_props.empty() || input_props.empty() ||
-      input_props.size() <= output_pos) {
+  if (reshape_props.empty() || input_props.size() <= output_pos) {
     return false;
   }
 
-- 
GitLab


From 68f8c6acbc58fe19ecc7808d707d655579cd71a8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 16:31:14 -0700
Subject: [PATCH 440/450] [XLA] Redesign: local_client_test_base and tests that
 depend on it.

PiperOrigin-RevId: 194317245
---
 tensorflow/compiler/xla/tests/BUILD           |  2 +-
 .../xla/tests/local_client_allocation_test.cc |  7 +-
 .../xla/tests/local_client_execute_test.cc    | 67 +++++++++----------
 .../xla/tests/local_client_test_base.cc       |  8 +--
 .../xla/tests/local_client_test_base.h        | 10 +--
 .../compiler/xla/tests/test_utils_test.cc     |  4 +-
 6 files changed, 48 insertions(+), 50 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index aec926d44b..c28d14ba8a 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -260,8 +260,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
diff --git a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
index 7209f91639..f21f83992f 100644
--- a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
@@ -15,9 +15,8 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
@@ -38,7 +37,7 @@ class LocalClientAllocationTest : public LocalClientTestBase {
 };
 
 XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>({0.0f, 1.0f, 2.0f});
   auto y = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   builder.Add(x, y);
@@ -74,7 +73,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
 XLA_TEST_F(LocalClientAllocationTest, RunOnDevices) {
   // Run a computation on every device on the system. Verify that allocation
   // occurs on the proper device.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR1<float>({0.0f, 1.0f, 2.0f});
   auto y = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   builder.Add(x, y);
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 26c33f7725..44c6811df8 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -18,9 +18,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -54,7 +53,7 @@ class LocalClientExecuteTest : public LocalClientTestBase {
 };
 
 XLA_TEST_F(LocalClientExecuteTest, Constant) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto y = builder.ConstantR0<float>(123.0f);
 
   ScopedShapedBuffer result =
@@ -64,7 +63,7 @@ XLA_TEST_F(LocalClientExecuteTest, Constant) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddScalars) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
   auto y = builder.ConstantR0<float>(123.0f);
   builder.Add(x, y);
@@ -77,7 +76,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddScalars) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddZeroElementVectors) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0}), "x");
   auto y = builder.ConstantR1<float>({});
   builder.Add(x, y);
@@ -90,7 +89,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddZeroElementVectors) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddVectors) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x");
   auto y = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   builder.Add(x, y);
@@ -104,7 +103,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectors) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x");
   auto y = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   builder.Add(x, y);
@@ -122,7 +121,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y");
   builder.Add(x, y);
@@ -155,7 +154,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y");
   builder.Add(x, y);
@@ -192,7 +191,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y");
   builder.Tuple({x, y, x});
@@ -220,7 +219,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y");
   auto inner_tuple = builder.Tuple({x, y, x});
@@ -254,7 +253,7 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
 
 XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) {
   // Verify setting the result layout of a computation with a tuple output.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y");
   builder.Tuple({x, y});
@@ -291,7 +290,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
 
   // Computation adds the respective array and vector elements from each tuple
   // argument and returns the results as a tuple.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, tuple_shape0, "x");
   auto y = builder.Parameter(1, tuple_shape1, "y");
   auto x_0 = builder.GetTupleElement(x, 0);
@@ -338,7 +337,7 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) {
 
   // Computation negates the array element and sums the two vector elements in
   // the nested tuple. The resulting array and vector are returned as a tuple.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto param = builder.Parameter(0, nested_tuple_shape, "param");
   auto inner_tuple = builder.GetTupleElement(param, 0);
   auto inner_array = builder.GetTupleElement(inner_tuple, 0);
@@ -376,7 +375,7 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) {
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({array_shape, array_shape});
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto param = builder.Parameter(0, tuple_shape, "param");
   auto element_0 = builder.GetTupleElement(param, 0);
   auto element_1 = builder.GetTupleElement(param, 1);
@@ -420,11 +419,11 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
   std::vector<Shape> element_shapes(kElementCount, element_shape);
   const Shape tuple_shape = ShapeUtil::MakeTupleShape(element_shapes);
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto param = builder.Parameter(0, tuple_shape, "param");
 
   // Add each element's tuple index value to every element.
-  std::vector<ComputationDataHandle> result_elements;
+  std::vector<XlaOp> result_elements;
   for (int i = 0; i < kElementCount; ++i) {
     auto element = builder.GetTupleElement(param, i);
     result_elements.push_back(
@@ -465,15 +464,15 @@ XLA_TEST_F(LocalClientExecuteTest, LargeNestedTuple) {
   std::vector<Shape> inner_tuple_shapes(kFanout, inner_tuple_shape);
   const Shape tuple_shape = ShapeUtil::MakeTupleShape(inner_tuple_shapes);
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto param = builder.Parameter(0, tuple_shape, "param");
 
   // The computation increments each leaf value by an amount equal to the leaf's
   // ordinal position in a traversal of the tuple.
-  std::vector<ComputationDataHandle> result_elements;
+  std::vector<XlaOp> result_elements;
   for (int i = 0; i < kFanout; ++i) {
     auto outer_element = builder.GetTupleElement(param, i);
-    std::vector<ComputationDataHandle> inner_result_elements;
+    std::vector<XlaOp> inner_result_elements;
     for (int j = 0; j < kFanout; ++j) {
       auto inner_element = builder.GetTupleElement(outer_element, j);
       inner_result_elements.push_back(builder.Add(
@@ -520,7 +519,7 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) {
     shape = ShapeUtil::MakeTupleShape({shape});
   }
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto element = builder.Parameter(0, shape, "param");
   for (int i = 0; i < kTupleDepth; ++i) {
     element = builder.GetTupleElement(element, 0);
@@ -554,7 +553,7 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) {
 
 XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) {
   // Test passing in an invalid number of arguments.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x");
   auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {3}), "y");
   builder.Add(x, y);
@@ -571,7 +570,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) {
 
 XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) {
   // Test passing in an argument with the wrong shape.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x");
   builder.Neg(x);
 
@@ -588,7 +587,7 @@ XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) {
 
 XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) {
   // Test passing in an invalid result layout parameter.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x");
   builder.Neg(x);
 
@@ -611,7 +610,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) {
 XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) {
   // Try to run a trivial computation on every device on the system. If a
   // specific device is not supported, check that the right error is returned.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(42.0f);
   auto computation = builder.Build().ConsumeValueOrDie();
   for (int d = 0; d < local_client_->device_count(); ++d) {
@@ -638,7 +637,7 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) {
 XLA_TEST_F(LocalClientExecuteTest, InvalidDeviceOrdinalValues) {
   // Try running computations on devices with device ordinal values which do not
   // exist.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(42.0f);
   auto computation = builder.Build().ConsumeValueOrDie();
 
@@ -655,7 +654,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidDeviceOrdinalValues) {
 
 XLA_TEST_F(LocalClientExecuteTest, RunOnStream) {
   // Run a computation on a specific stream on each device on the system.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(42.0f);
   auto computation = builder.Build().ConsumeValueOrDie();
 
@@ -691,7 +690,7 @@ XLA_TEST_F(LocalClientExecuteTest,
   se::Stream wrong_stream(wrong_platform->ExecutorForDevice(0).ValueOrDie());
   wrong_stream.Init();
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(42.0f);
   auto execute_status = ExecuteLocally(
       builder.Build().ValueOrDie(), {}, DefaultExecutableBuildOptions(),
@@ -708,7 +707,7 @@ XLA_TEST_F(LocalClientExecuteTest,
           .ValueOrDie();
   TestAllocator allocator(wrong_platform);
 
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto y = builder.ConstantR0<float>(123.0f);
 
   auto execute_status = ExecuteLocally(
@@ -721,7 +720,7 @@ XLA_TEST_F(LocalClientExecuteTest,
 
 XLA_TEST_F(LocalClientExecuteTest, RunOnUninitializedStream) {
   // Try to run a computation on a stream that has not been initialized.
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   builder.ConstantR0<float>(42.0f);
 
   LOG(INFO) << "default device = " << local_client_->default_device_ordinal();
@@ -741,7 +740,7 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnUninitializedStream) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -761,7 +760,7 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) {
 }
 
 XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x");
   auto y = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   builder.Add(x, y);
@@ -852,7 +851,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) {
 // TODO(b/34359662): Support infeed/outfeed on GPU and CPU parallel.
 // 2017-10-18.
 XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_GPU(InfeedOutfeedTest)) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {3});
   auto in = builder.Infeed(shape);
   auto constant = builder.ConstantR1<float>({1.0f, 2.0f, 3.0f});
@@ -890,7 +889,7 @@ void BM_LocalClientOverhead(int num_iters) {
   int device_ordinal = client->default_device_ordinal();
 
   // Use a tiny add operation as the computation.
-  ComputationBuilder builder(client, "Add");
+  XlaBuilder builder("Add");
   auto shape = ShapeUtil::MakeShape(F32, {2, 3});
   auto x = builder.Parameter(0, shape, "x");
   builder.Add(x, x);
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index b615f0fead..ca8e4cdbdb 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -157,7 +157,7 @@ ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const {
 }
 
 ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
-    const Computation& computation,
+    const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
                         DefaultExecutableRunOptions())
@@ -165,7 +165,7 @@ ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
 }
 
 ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
-    const Computation& computation,
+    const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableBuildOptions& build_options,
     const ExecutableRunOptions& run_options) {
@@ -174,14 +174,14 @@ ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
 }
 
 StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
-    const Computation& computation,
+    const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
                         DefaultExecutableRunOptions());
 }
 
 StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
-    const Computation& computation,
+    const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableBuildOptions& build_options,
     const ExecutableRunOptions& run_options) {
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 4ee56a05ec..3bbb760c80 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -21,8 +21,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
@@ -93,19 +93,19 @@ class LocalClientTestBase : public ::testing::Test {
   // Execute the given computation on the local client. With and without
   // options.
   StatusOr<ScopedShapedBuffer> ExecuteLocally(
-      const Computation& computation,
+      const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
   StatusOr<ScopedShapedBuffer> ExecuteLocally(
-      const Computation& computation,
+      const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutableBuildOptions& build_options,
       const ExecutableRunOptions& run_options);
 
   ScopedShapedBuffer ExecuteLocallyOrDie(
-      const Computation& computation,
+      const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
   ScopedShapedBuffer ExecuteLocallyOrDie(
-      const Computation& computation,
+      const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutableBuildOptions& build_options,
       const ExecutableRunOptions& run_options);
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index e8efc6e2a8..59afd28a80 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -28,7 +28,7 @@ namespace {
 class TestUtilsTest : public LocalClientTestBase {};
 
 XLA_TEST_F(TestUtilsTest, UnusedParam) {
-  ComputationBuilder builder(local_client_, TestName());
+  XlaBuilder builder(TestName());
   // Make the reduction lambda.
   Shape single_float = ShapeUtil::MakeShape(F32, {});
   builder.Parameter(0, single_float, "unused");
-- 
GitLab


From 2b570e00f1876d4da733edd41e854878d34b0469 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 16:37:02 -0700
Subject: [PATCH 441/450] Automated g4 rollback of changelist 194268101

PiperOrigin-RevId: 194318022
---
 tensorflow/core/platform/default/build_config.bzl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index ec3bbab1fd..ca0587e277 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -575,11 +575,11 @@ def tf_additional_lib_defines():
       "//tensorflow:with_jemalloc_linux_x86_64": ["TENSORFLOW_USE_JEMALLOC"],
       "//tensorflow:with_jemalloc_linux_ppc64le":["TENSORFLOW_USE_JEMALLOC"],
       "//conditions:default": [],
-  }) + ["TENSORFLOW_USE_ABSL"]
+  }) + if_not_mobile(["TENSORFLOW_USE_ABSL"])
 
 def tf_additional_lib_deps():
   """Additional dependencies needed to build TF libraries."""
-  return ["@com_google_absl//absl/base:base"] + if_static(
+  return if_not_mobile(["@com_google_absl//absl/base:base"]) + if_static(
       ["@nsync//:nsync_cpp"],
       ["@nsync//:nsync_headers"]
   ) + select({
-- 
GitLab


From ca634912e9b121d2e6b2ea04084886c73993e6aa Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Wed, 25 Apr 2018 17:59:01 -0700
Subject: [PATCH 442/450] Preserve guarantees about const-ness when creating
 TensorFlow functions.

PiperOrigin-RevId: 194328218
---
 tensorflow/python/framework/function.py      | 57 +++++++++++++++++++-
 tensorflow/python/framework/function_test.py | 23 ++++++++
 2 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index f343edc483..2432ab378c 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -715,7 +715,11 @@ class _FuncGraph(ops.Graph):
     # pylint: enable=protected-access
     self._captured[tensor] = ph
     self.extra_args.append(ph)
-    return ph
+    if _is_guaranteed_const(tensor):
+      with ops.control_dependencies(None):
+        return array_ops.guarantee_const(ph)
+    else:
+      return ph
 
   def _add_tensor_and_parents(self, tensor):
     op = self._add_op_and_parents(tensor.op)
@@ -747,6 +751,57 @@ class _FuncGraph(ops.Graph):
     return captured_op
 
 
+def _is_guaranteed_const(tensor):
+  """Determines whether `tensor` is guaranteed to be a constant.
+
+  A tensor is guaranteed to be a constant if either it was produced by
+  a `GuaranteeConst` op or if all inputs of the op that produced it are
+  guaranteed to be constants.
+
+  Args:
+    tensor: The tensor for which to determine const-ness.
+
+  Returns:
+    True if `tensor` is guaranteed to be a constant, False otherwise.
+  """
+
+  if isinstance(tensor, ops.EagerTensor):
+    return False
+
+  class Work(object):
+
+    def __init__(self, op, leaving):
+      self.op = op
+      self.leaving = leaving
+
+  is_guaranteed_const = lambda op: op.node_def.op == "GuaranteeConst"
+  constants = set([])
+  def all_inputs_const(op):
+    # If all inputs of an op are guaranteed constants, then we can infer that
+    # the op produces a constant as well.
+    return op.inputs and all(inp.op in constants for inp in op.inputs)
+
+  visited = set([])
+  stack = [Work(tensor.op, leaving=False)]
+  while stack:
+    work = stack.pop()
+    if work.leaving:
+      if all_inputs_const(work.op):
+        constants.add(work.op)
+      continue
+    visited.add(work.op)
+    if is_guaranteed_const(work.op):
+      constants.add(work.op)
+      continue
+
+    # This op will be revisited after all its inputs are checked for const-ness.
+    stack.append(Work(work.op, leaving=True))
+    for inp in work.op.inputs:
+      if inp.op not in visited:
+        stack.append(Work(inp.op, leaving=False))
+  return tensor.op in constants
+
+
 def _call(sig, *inputs, **kwargs):
   """Adds a node calling a function.
 
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index cfdacee54f..594596ec1e 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -1050,6 +1050,29 @@ class FunctionTest(test.TestCase):
         self.assertEqual(44.0, sess.run(f_1))
         self.assertEqual((42.0, 44.0), sess.run((f_0, f_1)))
 
+  def testGuaranteedConstsAreCaptured(self):
+    var = variables.Variable(1.0)
+    const = array_ops.guarantee_const(var)
+    also_const = array_ops.identity(const)
+    still_const = array_ops.identity(also_const)
+    not_const = still_const + var
+    also_not_const = array_ops.placeholder(dtypes.float32)
+
+    @function.Defun()
+    def CapturesGuaranteedConst():
+      output = const + also_const + still_const + not_const + also_not_const
+      first, second, third, fourth, fifth = function.get_extra_args()
+      self.assertEqual("GuaranteeConst", first.consumers()[0].node_def.op)
+      self.assertEqual("GuaranteeConst", second.consumers()[0].node_def.op)
+      self.assertEqual("GuaranteeConst", third.consumers()[0].node_def.op)
+      self.assertNotEqual("GuaranteeConst", fourth.consumers()[0].node_def.op)
+      self.assertNotEqual("GuaranteeConst", fifth.consumers()[0].node_def.op)
+      return output
+
+    with self.test_session(use_gpu=False) as sess:
+      sess.run(var.initializer)
+      _ = sess.run(CapturesGuaranteedConst(), {also_not_const: 1.0})
+
 
 @test_util.with_c_shapes
 class FunctionsFromProtos(test.TestCase):
-- 
GitLab


From 10ea32657868f0ef60cb583d64abaea389a67a68 Mon Sep 17 00:00:00 2001
From: Francois Chollet <>
Date: Tue, 24 Apr 2018 17:00:40 -0700
Subject: [PATCH 443/450] Fix critical metrics computation bug with Model in
 Eager mode.

---
 tensorflow/python/keras/_impl/keras/engine/training_eager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
index ad239d6151..34adeb7599 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -96,7 +96,7 @@ def _eager_metrics_fn(model, outputs, targets):
           model.metrics_names.append(metric_name)
 
       with backend.name_scope(metric_name):
-        metric_result = metric_fn(outputs[i], targets[i])
+        metric_result = metric_fn(targets[i], outputs[i])
         metric_names.append(metric_name)
         metric_results.append(backend.mean(metric_result))
 
-- 
GitLab


From 270a6e925493b6c2219b7a0152f6b81fbb88dfee Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Wed, 25 Apr 2018 19:00:21 -0700
Subject: [PATCH 444/450] Cudnn RNN v2 kernels with autotune capability

CudnnRNN V2 kernels run all applicable cudnn rnn algorithms and pick the best one for following runs.
* To enable autotune, TF_CUDNN_RNN_USE_AUTOTUNE and TF_CUDNN_RNN_USE_V2 need to be set to {"1" or unset}.
* TF_CUDNN_RNN_USE_AUTOTUNE does not work with existing CudnnRNN kernels.
* V2 kernels work with existing cudnn checkpoints, since it doesn't change persistence format.

This change
* Introduces v2 kernels as templates inheriting the v1 kernels.
* Profiles fwd and bak runs in v2 kernel (forward pass)
* Exposes the chosen algorithm as fwd op output and bak op input.
* Changes rnn descriptor cache key to include AlgorithmDesc (since cudnn rnn descriptor can't be reused across different algorithms)
* Updates unittests s.t. it tests both v1 and v2 kernels. When testing v2 kernels, autotune is turned on.

PiperOrigin-RevId: 194333948
---
 .../python/kernel_tests/cudnn_rnn_test.py     |  32 +-
 .../cudnn_rnn/python/ops/cudnn_rnn_ops.py     |  35 +-
 .../api_def/base_api/api_def_CudnnRNN.pbtxt   |  26 +-
 .../base_api/api_def_CudnnRNNBackprop.pbtxt   |  24 +-
 .../base_api/api_def_CudnnRNNBackpropV2.pbtxt |  49 ++
 .../api_def/base_api/api_def_CudnnRNNV2.pbtxt |  40 ++
 tensorflow/core/kernels/BUILD                 |   1 +
 tensorflow/core/kernels/cudnn_rnn_ops.cc      | 453 ++++++++++++++++--
 tensorflow/core/ops/cudnn_rnn_ops.cc          |  79 +++
 tensorflow/core/ops/cudnn_rnn_ops_test.cc     |  35 +-
 tensorflow/core/util/use_cudnn.cc             |  46 +-
 tensorflow/core/util/use_cudnn.h              |  13 +-
 tensorflow/python/ops/cudnn_rnn_grad.py       |  28 +-
 tensorflow/stream_executor/cuda/cuda_dnn.cc   |  78 ++-
 tensorflow/stream_executor/cuda/cuda_dnn.h    |   2 +-
 tensorflow/stream_executor/dnn.cc             |   5 +
 tensorflow/stream_executor/dnn.h              |   3 +-
 .../stream_executor/stream_executor_pimpl.cc  |   7 +-
 .../stream_executor/stream_executor_pimpl.h   |   2 +-
 19 files changed, 830 insertions(+), 128 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV2.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_CudnnRNNV2.pbtxt

diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 6fb56b0858..012b17cee8 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -1072,6 +1072,17 @@ class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase):
 
 class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
 
+  def setUp(self):
+    super(CudnnRNNTestTraining, self).setUp()
+    self._reset_rnd_gen_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE",
+                                               str(False))
+    self._rnn_use_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0")
+
+  def tearDown(self):
+    super(CudnnRNNTestTraining, self).tearDown()
+    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = self._reset_rnd_gen_state
+    os.environ["TF_CUDNN_RNN_USE_V2"] = self._rnn_use_v2
+
   def _ComputeNumericGrad(self, sess, y, x, delta=1e-4, step=1):
     """Compute the numeric gradient of y wrt to x.
 
@@ -1184,11 +1195,10 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
 
   def _TestOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
                              batch_size, seq_length, dir_count, dropout, dtype,
-                             delta, tolerance):
+                             use_v2, delta, tolerance):
     # Gradient checking runs two forward ops with almost the same input. Need to
     # make sure the drop patterns across the two runs are the same.
     logging.info("Training test with config: %s", locals())
-    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
     os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
 
     np.random.seed(1234)
@@ -1196,6 +1206,10 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
     has_input_c = (rnn_mode == CUDNN_LSTM)
     direction = (CUDNN_RNN_UNIDIRECTION
                  if dir_count == 1 else CUDNN_RNN_BIDIRECTION)
+    if use_v2:
+      os.environ["TF_CUDNN_RNN_USE_V2"] = "1"
+    else:
+      os.environ["TF_CUDNN_RNN_USE_V2"] = "0"
     model = CudnnTestModel(
         rnn_mode,
         num_layers,
@@ -1245,22 +1259,22 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
           self._GradientCheck(
               sess, total_sum, all_inputs,
               tolerance=tolerance, delta=delta)
-      os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
 
   def _TestSimpleTrainingHelper(self, rnn_mode, test_configs):
     dropouts = [0, 0.5, 1.]
-    for config, dropout in itertools.product(test_configs, dropouts):
+    v2_options = [False, True]  # Booleans: any non-empty str is truthy.
+    for config, dropout, use_v2 in itertools.product(test_configs, dropouts,
+                                                     v2_options):
       dtype = config.get("dtype", dtypes.float32)
       delta = config.get("delta", 1e-4)
       tolerance = config.get("tolerance", 1e-6)
       dir_count = config.get("dir_count", 1)
       shape = config["shape"]
       with ops.Graph().as_default():
-        self._TestOneSimpleTraining(rnn_mode, shape["num_layers"],
-                                    shape["num_units"], shape["input_size"],
-                                    shape["batch_size"], shape["seq_length"],
-                                    dir_count, dropout, dtype, delta,
-                                    tolerance)
+        self._TestOneSimpleTraining(
+            rnn_mode, shape["num_layers"], shape["num_units"],
+            shape["input_size"], shape["batch_size"], shape["seq_length"],
+            dir_count, dropout, dtype, use_v2, delta, tolerance)
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index a1ede4471e..73a961992e 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 from tensorflow.contrib.checkpoint.python import split_dependency
 from tensorflow.contrib.rnn.python.ops import lstm_ops
 from tensorflow.python.framework import common_shapes
@@ -901,19 +902,27 @@ def _cudnn_rnn(inputs,
   check_direction(direction)
   check_input_mode(input_mode)
   seed, seed2 = random_seed.get_seed(seed)
-  outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
-      input=inputs,
-      input_h=input_h,
-      input_c=input_c,
-      params=params,
-      is_training=is_training,
-      rnn_mode=rnn_mode,
-      input_mode=input_mode,
-      direction=direction,
-      dropout=dropout,
-      seed=seed,
-      seed2=seed2,
-      name=name)
+  # TODO(jamesqin): switch default value to "1" on May 25th 2018, and get rid
+  # of V1 ops.
+  use_cudnn_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0")
+  args = {
+      "input": inputs,
+      "input_h": input_h,
+      "input_c": input_c,
+      "params": params,
+      "is_training": is_training,
+      "rnn_mode": rnn_mode,
+      "input_mode": input_mode,
+      "direction": direction,
+      "dropout": dropout,
+      "seed": seed,
+      "seed2": seed2,
+      "name": name
+  }
+  if use_cudnn_v2 != "1":  # Compare by value; `is not` tests identity.
+    outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args)
+  else:
+    outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args)
   return (outputs, output_h, output_c)
 
 
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNN.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNN.pbtxt
index daeb5fe9a2..461b498662 100644
--- a/tensorflow/core/api_def/base_api/api_def_CudnnRNN.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNN.pbtxt
@@ -7,30 +7,30 @@ buffer.
 
 rnn_mode: Indicates the type of the RNN model.
 input_mode: Indicate whether there is a linear projection between the input and
-  The actual computation before the first layer. 'skip_input' is only allowed
+  the actual computation before the first layer. 'skip_input' is only allowed
   when input_size == num_units; 'auto_select' implies 'skip_input' when
   input_size == num_units; otherwise, it implies 'linear_input'.
-direction: Indicates whether a bidirectional model will be used.
-  dir = (direction == bidirectional) ? 2 : 1
-dropout: dropout probability. When set to 0., dropout is disabled.
-seed: the 1st part of a seed to initialize dropout.
-seed2: the 2nd part of a seed to initialize dropout.
-input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
-input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
+direction: Indicates whether a bidirectional model will be used. Should be
+  "unidirectional" or "bidirectional".
+dropout: Dropout probability. When set to 0., dropout is disabled.
+seed: The 1st part of a seed to initialize dropout.
+seed2: The 2nd part of a seed to initialize dropout.
+input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
     num_units].
 input_c: For LSTM, a 3-D tensor with the shape of
     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-params: a 1-D tensor that contains the weights and biases in an opaque layout.
+params: A 1-D tensor that contains the weights and biases in an opaque layout.
     The size must be created through CudnnRNNParamsSize, and initialized
     separately. Note that they might not be compatible across different
     generations. So it is a good idea to save and restore
-output: a 3-D tensor with the shape of [seq_length, batch_size,
+output: A 3-D tensor with the shape of [seq_length, batch_size,
     dir * num_units].
-output_h: the same shape has input_h.
-output_c: the same shape as input_c for LSTM. An empty tensor for other models.
+output_h: The same shape as input_h.
+output_c: The same shape as input_c for LSTM. An empty tensor for other models.
 is_training: Indicates whether this operation is used for inferenece or
   training.
-reserve_space: an opaque tensor that can be used in backprop calculation. It
+reserve_space: An opaque tensor that can be used in backprop calculation. It
   is only produced if is_training is false.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackprop.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackprop.pbtxt
index 075ec52648..7cd5ae637b 100644
--- a/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackprop.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackprop.pbtxt
@@ -6,27 +6,27 @@ Compute the backprop of both data and weights in a RNN.
 
 rnn_mode: Indicates the type of the RNN model.
 input_mode: Indicate whether there is a linear projection between the input and
-    The actual computation before the first layer. 'skip_input' is only allowed
+    the actual computation before the first layer. 'skip_input' is only allowed
     when input_size == num_units; 'auto_select' implies 'skip_input' when
     input_size == num_units; otherwise, it implies 'linear_input'.
-direction: Indicates whether a bidirectional model will be used.
-    dir = (direction == bidirectional) ? 2 : 1
-dropout: dropout probability. When set to 0., dropout is disabled.
-seed: the 1st part of a seed to initialize dropout.
-seed2: the 2nd part of a seed to initialize dropout.
-input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
-input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
+direction: Indicates whether a bidirectional model will be used. Should be
+  "unidirectional" or "bidirectional".
+dropout: Dropout probability. When set to 0., dropout is disabled.
+seed: The 1st part of a seed to initialize dropout.
+seed2: The 2nd part of a seed to initialize dropout.
+input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
     num_units].
 input_c: For LSTM, a 3-D tensor with the shape of
     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-params: a 1-D tensor that contains the weights and biases in an opaque layout.
+params: A 1-D tensor that contains the weights and biases in an opaque layout.
     The size must be created through CudnnRNNParamsSize, and initialized
     separately. Note that they might not be compatible across different
     generations. So it is a good idea to save and restore
-output: a 3-D tensor with the shape of [seq_length, batch_size,
+output: A 3-D tensor with the shape of [seq_length, batch_size,
     dir * num_units].
-output_h: the same shape has input_h.
-output_c: the same shape as input_c for LSTM. An empty tensor for other models.
+output_h: The same shape as input_h.
+output_c: The same shape as input_c for LSTM. An empty tensor for other models.
 output_backprop: A 3-D tensor with the same shape as output in the forward pass.
 output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
     pass.
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV2.pbtxt
new file mode 100644
index 0000000000..03aa9cc250
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV2.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "CudnnRNNBackpropV2"
+  visibility: HIDDEN
+  summary: "Backprop step of CudnnRNN."
+  description: <<END
+Compute the backprop of both data and weights in a RNN. Takes an extra
+    "host_reserved" input than CudnnRNNBackprop, which is used to determine RNN
+    cudnnRNNAlgo_t and cudnnMathType_t.
+
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicates whether there is a linear projection between the input and
+    the actual computation before the first layer. 'skip_input' is only allowed
+    when input_size == num_units; 'auto_select' implies 'skip_input' when
+    input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used. Should be
+  "unidirectional" or "bidirectional".
+dropout: Dropout probability. When set to 0., dropout is disabled.
+seed: The 1st part of a seed to initialize dropout.
+seed2: The 2nd part of a seed to initialize dropout.
+input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+    num_units].
+input_c: For LSTM, a 3-D tensor with the shape of
+    [num_layer * dir, batch, num_units]. For other models, it is ignored.
+params: A 1-D tensor that contains the weights and biases in an opaque layout.
+    The size must be created through CudnnRNNParamsSize, and initialized
+    separately. Note that they might not be compatible across different
+    generations. So it is a good idea to save and restore
+output: A 3-D tensor with the shape of [seq_length, batch_size,
+    dir * num_units].
+output_h: The same shape as input_h.
+output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+    pass.
+output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+    pass.
+reserve_space: The same reserve_space produced in the forward operation.
+host_reserved: The same host_reserved produced in the forward operation.
+input_backprop: The backprop to input in the forward pass. Has the same shape
+    as input.
+input_h_backprop: The backprop to input_h in the forward pass. Has the same
+    shape as input_h.
+input_c_backprop: The backprop to input_c in the forward pass. Has the same
+    shape as input_c.
+params_backprop: The backprop to the params buffer in the forward pass. Has the
+    same shape as params.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNV2.pbtxt
new file mode 100644
index 0000000000..c8a39de68c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNV2.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "CudnnRNNV2"
+  visibility: HIDDEN
+  summary: "A RNN backed by cuDNN."
+  description: <<END
+Computes the RNN from the input and initial states, with respect to the params
+buffer. Produces one more output, "host_reserved", than CudnnRNN.
+
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicates whether there is a linear projection between the input and
+  the actual computation before the first layer. 'skip_input' is only allowed
+  when input_size == num_units; 'auto_select' implies 'skip_input' when
+  input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used. Should be
+  "unidirectional" or "bidirectional".
+dropout: Dropout probability. When set to 0., dropout is disabled.
+seed: The 1st part of a seed to initialize dropout.
+seed2: The 2nd part of a seed to initialize dropout.
+input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+    num_units].
+input_c: For LSTM, a 3-D tensor with the shape of
+    [num_layer * dir, batch, num_units]. For other models, it is ignored.
+params: A 1-D tensor that contains the weights and biases in an opaque layout.
+    The size must be created through CudnnRNNParamsSize, and initialized
+    separately. Note that they might not be compatible across different
+    generations. So it is a good idea to save and restore
+output: A 3-D tensor with the shape of [seq_length, batch_size,
+    dir * num_units].
+output_h: The same shape as input_h.
+output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+is_training: Indicates whether this operation is used for inference or
+  training.
+reserve_space: An opaque tensor that can be used in backprop calculation. It
+  is only produced if is_training is true.
+host_reserved: An opaque tensor that can be used in backprop calculation. It is
+  only produced if is_training is true. It is output on host memory rather than
+  device memory.
+END
+}
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index f715cddfa6..6355f13654 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -943,6 +943,7 @@ tf_kernel_library(
     srcs = ["cudnn_rnn_ops.cc"],
     visibility = ["//visibility:public"],
     deps = [
+        ":gpu_util_hdrs",
         "//tensorflow/core:cudnn_rnn_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index 762c2c3666..25560b7c28 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/gpu_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -43,6 +44,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/env_var.h"
+#include "tensorflow/core/util/use_cudnn.h"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
@@ -78,7 +80,9 @@ using CPUDevice = Eigen::ThreadPoolDevice;
 #if GOOGLE_CUDA
 
 using GPUDevice = Eigen::GpuDevice;
+using se::Stream;
 using se::StreamExecutor;
+using se::dnn::RnnDescriptor;
 
 template <typename Device, typename T, typename Index>
 class CudnnRNNParamsSizeOp;
@@ -95,6 +99,12 @@ class CudnnRNNForwardOp;
 template <typename Device, typename T>
 class CudnnRNNBackwardOp;
 
+template <typename Device, typename T>
+class CudnnRNNForwardOpV2;
+
+template <typename Device, typename T>
+class CudnnRNNBackwardOpV2;
+
 enum class TFRNNInputMode {
   kRNNLinearInput = 0,
   kRNNSkipInput = 1,
@@ -105,11 +115,9 @@ namespace {
 using se::DeviceMemory;
 using se::DeviceMemoryBase;
 using se::ScratchAllocator;
-using se::Stream;
 using se::dnn::AlgorithmConfig;
 using se::dnn::AlgorithmDesc;
 using se::dnn::ProfileResult;
-using se::dnn::RnnDescriptor;
 using se::dnn::RnnDirectionMode;
 using se::dnn::RnnInputMode;
 using se::dnn::RnnMode;
@@ -118,6 +126,98 @@ using se::dnn::RnnStateTensorDescriptor;
 using se::dnn::ToDataType;
 using se::port::StatusOr;
 
+uint64 HashList(const std::vector<int>& list) {  // Folds left to right.
+  if (list.empty()) {
+    return 0;  // An empty list hashes to 0.
+  }
+  uint64 hash_code = list[0];  // Seed with the first element as-is.
+  for (size_t i = 1; i < list.size(); i++) {  // size() is unsigned.
+    hash_code = Hash64Combine(hash_code, list[i]);
+  }
+  return hash_code;
+}
+
+// Encapsulate all the shape information that is used in both forward and
+// backward rnn operations.
+class CudnnRnnParameters {
+ public:
+  CudnnRnnParameters(int num_layers, int input_size, int num_units,
+                     int seq_length, int batch_size, int dir_count,
+                     bool has_dropout, bool is_training, RnnMode rnn_mode,
+                     TFRNNInputMode rnn_input_mode, DataType dtype)
+      : num_layers_(num_layers),
+        input_size_(input_size),
+        num_units_(num_units),
+        seq_length_(seq_length),
+        batch_size_(batch_size),
+        dir_count_(dir_count),
+        has_dropout_(has_dropout),
+        is_training_(is_training),
+        rnn_mode_(rnn_mode),
+        rnn_input_mode_(rnn_input_mode),
+        dtype_(dtype) {
+    hash_code_ = HashList(
+        {num_layers, input_size, num_units, seq_length, batch_size, dir_count,
+         static_cast<int>(has_dropout), static_cast<int>(is_training),
+         static_cast<int>(rnn_mode), static_cast<int>(rnn_input_mode), dtype});
+  }
+
+  bool operator==(const CudnnRnnParameters& other) const {
+    return this->get_data_as_tuple() == other.get_data_as_tuple();
+  }
+
+  bool operator!=(const CudnnRnnParameters& other) const {
+    return !(*this == other);
+  }
+  uint64 hash() const { return hash_code_; }
+
+  string ToString() const {
+    std::vector<string> fields = {
+        std::to_string(num_layers_),
+        std::to_string(input_size_),
+        std::to_string(num_units_),
+        std::to_string(seq_length_),
+        std::to_string(batch_size_),
+        std::to_string(dir_count_),
+        std::to_string(has_dropout_),
+        std::to_string(is_training_),
+        std::to_string(static_cast<int>(rnn_mode_)),
+        std::to_string(static_cast<int>(rnn_input_mode_)),
+        std::to_string(static_cast<int>(dtype_))};
+    return str_util::Join(fields, ", ");
+  }
+
+ private:
+  using ParameterDataType = std::tuple<int, int, int, int, int, int, bool, bool,
+                                       RnnMode, TFRNNInputMode, DataType>;
+
+  ParameterDataType get_data_as_tuple() const {
+    return std::make_tuple(num_layers_, input_size_, num_units_, seq_length_,
+                           batch_size_, dir_count_, has_dropout_, is_training_,
+                           rnn_mode_, rnn_input_mode_, dtype_);
+  }
+
+  const int num_layers_;
+  const int input_size_;
+  const int num_units_;
+  const int seq_length_;
+  const int batch_size_;
+  const int dir_count_;
+  const bool has_dropout_;
+  const bool is_training_;
+  const RnnMode rnn_mode_;
+  const TFRNNInputMode rnn_input_mode_;
+  const DataType dtype_;
+  uint64 hash_code_;
+};
+
+struct RnnAutoTuneGroup {
+  static string name() { return "Rnn"; }
+};
+
+using AutoTuneRnnConfigMap =
+    AutoTuneSingleton<RnnAutoTuneGroup, CudnnRnnParameters, AlgorithmConfig>;
+
 Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
   if (str == "rnn_relu") {
     *rnn_mode = RnnMode::kRnnRelu;
@@ -215,8 +315,7 @@ DeviceMemoryBase SliceDeviceMemory(const DeviceMemoryBase& device_memory,
 
 inline Status FromExecutorStatus(const se::port::Status& s) {
   return s.ok() ? Status::OK()
-                : Status(static_cast<tensorflow::error::Code>(
-                             static_cast<int>(s.code())),
+                : Status(static_cast<error::Code>(static_cast<int>(s.code())),
                          s.error_message());
 }
 
@@ -412,24 +511,29 @@ struct CudnnRnnModelShapes {
   }
 };
 
-// Utility class for using CudnnRnnModelShapes as a hash table key.
-struct CudnnRnnModelShapesHasher {
-  uint64 operator()(const CudnnRnnModelShapes& to_hash) const {
-    uint64 hash = static_cast<uint64>(to_hash.num_layers);
-    hash = tensorflow::FingerprintCat64(
-        hash, static_cast<uint64>(to_hash.input_size));
-    hash = tensorflow::FingerprintCat64(hash,
-                                        static_cast<uint64>(to_hash.num_units));
-    return tensorflow::FingerprintCat64(hash,
-                                        static_cast<uint64>(to_hash.dir_count));
+// Utility class for using CudnnRnnModelShapes and AlgorithmDesc pair as a hash
+// table key.
+struct CudnnRnnConfigHasher {
+  uint64 operator()(
+      const std::pair<CudnnRnnModelShapes, AlgorithmDesc>& to_hash) const {
+    auto& shapes = to_hash.first;
+    auto& algo_desc = to_hash.second;
+
+    uint64 hash =
+        HashList({shapes.num_layers, shapes.input_size, shapes.num_units,
+                  shapes.dir_count, shapes.batch_size});
+    hash = Hash64Combine(hash, algo_desc.hash());
+    return hash;
   }
 };
 
-// Utility class for using CudnnRnnModelShapes as a hash table key.
-struct CudnnRnnModelShapesComparator {
-  bool operator()(const CudnnRnnModelShapes& first,
-                  const CudnnRnnModelShapes& second) const {
-    return first.IsCompatibleWith(second);
+// Utility class for using CudnnRnnModelShapes and AlgorithmDesc pair as a hash
+// table key.
+struct CudnnRnnConfigComparator {
+  bool operator()(
+      const std::pair<CudnnRnnModelShapes, AlgorithmDesc>& lhs,
+      const std::pair<CudnnRnnModelShapes, AlgorithmDesc>& rhs) const {
+    return lhs.first.IsCompatibleWith(rhs.first) && lhs.second == rhs.second;
   }
 };
 
@@ -717,7 +821,7 @@ class CudnnRNNKernelCommon : public OpKernel {
   RnnDirectionMode rnn_direction_mode() const {
     return model_types_.rnn_direction_mode;
   }
-  CudnnModelTypes model_types() const { return model_types_; }
+  const CudnnModelTypes& model_types() const { return model_types_; }
   float dropout() const { return dropout_; }
   uint64 seed() { return (static_cast<uint64>(seed_) << 32) | seed2_; }
   bool ResetRndGenState() { return reset_rnd_gen_state_; }
@@ -753,9 +857,9 @@ class CudnnRNNKernelCommon : public OpKernel {
     // random number generator, therefore set state_allocator to nullptr.
     const AlgorithmConfig algo_config;
     auto rnn_desc_s = stream->parent()->createRnnDescriptor(
-        num_layers, num_units, input_size, input_mode, rnn_direction_mode(),
-        rnn_mode(), ToDataType<T>::value, algo_config, dropout(), seed(),
-        nullptr /* state_allocator */);
+        num_layers, num_units, input_size, /*batch_size=*/0, input_mode,
+        rnn_direction_mode(), rnn_mode(), ToDataType<T>::value, algo_config,
+        dropout(), seed(), /* state_allocator=*/nullptr);
     if (!rnn_desc_s.ok()) {
       return FromExecutorStatus(rnn_desc_s);
     }
@@ -774,8 +878,9 @@ class CudnnRNNKernelCommon : public OpKernel {
     se::dnn::DataType data_type = ToDataType<T>::value;
     auto rnn_desc_s = executor->createRnnDescriptor(
         model_shapes.num_layers, model_shapes.num_units,
-        model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
-        data_type, algo_config, dropout(), seed(), dropout_state_allocator);
+        model_shapes.input_size, model_shapes.batch_size, input_mode,
+        rnn_direction_mode(), rnn_mode(), data_type, algo_config, dropout(),
+        seed(), dropout_state_allocator);
     TF_RETURN_IF_ERROR(rnn_desc_s.status());
 
     *rnn_desc = rnn_desc_s.ConsumeValueOrDie();
@@ -783,8 +888,9 @@ class CudnnRNNKernelCommon : public OpKernel {
   }
 
   using RnnStateCache =
-      gtl::FlatMap<CudnnRnnModelShapes, RnnScratchSpace,
-                   CudnnRnnModelShapesHasher, CudnnRnnModelShapesComparator>;
+      gtl::FlatMap<std::pair<CudnnRnnModelShapes, AlgorithmDesc>,
+                   RnnScratchSpace, CudnnRnnConfigHasher,
+                   CudnnRnnConfigComparator>;
   // Returns a raw rnn descriptor pointer. The cache owns the rnn descriptor and
   // should outlive the returned pointer.
   template <typename T>
@@ -794,7 +900,8 @@ class CudnnRNNKernelCommon : public OpKernel {
                                 const AlgorithmConfig& algo_config,
                                 RnnStateCache* cache,
                                 RnnDescriptor** rnn_desc) {
-    RnnScratchSpace& rnn_state = (*cache)[model_shapes];
+    auto key = std::make_pair(model_shapes, algo_config.algorithm());
+    RnnScratchSpace& rnn_state = (*cache)[key];
     if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) {
       CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
           new CudnnRNNPersistentSpaceAllocator(context);
@@ -823,7 +930,6 @@ class CudnnRNNKernelCommon : public OpKernel {
 template <typename T, typename Index>
 class CudnnRNNParamsSizeOp<GPUDevice, T, Index> : public CudnnRNNKernelCommon {
  public:
-  typedef GPUDevice Device;
   explicit CudnnRNNParamsSizeOp(OpKernelConstruction* context)
       : CudnnRNNKernelCommon(context) {}
 
@@ -862,7 +968,6 @@ TF_CALL_double(REGISTER_GPU);
 template <typename T>
 class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
  public:
-  typedef GPUDevice Device;
   explicit CudnnRNNParamsToCanonical(OpKernelConstruction* context)
       : CudnnRNNKernelCommon(context) {
     OP_REQUIRES_OK(context, context->GetAttr("num_params", &num_params_));
@@ -997,7 +1102,6 @@ TF_CALL_double(REGISTER_GPU);
 template <typename T>
 class CudnnRNNCanonicalToParams<GPUDevice, T> : public CudnnRNNKernelCommon {
  public:
-  typedef GPUDevice Device;
   explicit CudnnRNNCanonicalToParams(OpKernelConstruction* context)
       : CudnnRNNKernelCommon(context) {}
 
@@ -1043,13 +1147,26 @@ TF_CALL_double(REGISTER_GPU);
 template <typename T>
 class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
  public:
-  typedef GPUDevice Device;
   explicit CudnnRNNForwardOp(OpKernelConstruction* context)
       : CudnnRNNKernelCommon(context) {
     OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
+
+    // Read debug env variables.
+    is_debug_mode_ = DebugCudnnRnn();
+    debug_cudnn_rnn_algo_ = DebugCudnnRnnAlgo();
+    debug_use_tensor_ops_ = DebugCudnnRnnUseTensorOps();
   }
 
   void Compute(OpKernelContext* context) override {
+    AlgorithmConfig algo_config;
+    ComputeAndReturnAlgorithm(context, &algo_config);
+  }
+
+ protected:
+  virtual void ComputeAndReturnAlgorithm(OpKernelContext* context,
+                                         AlgorithmConfig* output_algo_config) {
+    CHECK_NE(output_algo_config, nullptr);
+
     const Tensor* input = nullptr;
     const Tensor* input_h = nullptr;
     const Tensor* input_c = nullptr;
@@ -1069,7 +1186,6 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context, AllocateOutputs(context, model_shapes, &output,
                                             &output_h, &output_c));
 
-    AlgorithmConfig algo_config;
     // Creates a memory callback for the reserve_space. The memory lives in the
     // output of this kernel. And it will be fed into the backward pass when
     // needed.
@@ -1077,14 +1193,25 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
     CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
+
+    if (is_debug_mode_) {
+      AlgorithmDesc algo_desc(debug_cudnn_rnn_algo_, debug_use_tensor_ops_);
+      output_algo_config->set_algorithm(algo_desc);
+    } else {
+      OP_REQUIRES_OK(context,
+                     MaybeAutoTune(context, model_shapes, input_mode, input,
+                                   input_h, input_c, params, output, output_h,
+                                   output_c, output_algo_config));
+    }
+
     Status launch_status;
     {
       mutex_lock l(mu_);
       RnnDescriptor* rnn_desc_ptr = nullptr;
       OP_REQUIRES_OK(
           context, GetCachedRnnDescriptor<T>(context, model_shapes, input_mode,
-                                             algo_config, &rnn_state_cache_,
-                                             &rnn_desc_ptr));
+                                             *output_algo_config,
+                                             &rnn_state_cache_, &rnn_desc_ptr));
       launch_status = DoForward<T>(
           context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
           input_c, params, is_training_, output, output_h, output_c,
@@ -1094,6 +1221,25 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context, launch_status);
   }
 
+ protected:
+  virtual Status MaybeAutoTune(OpKernelContext* context,
+                               const CudnnRnnModelShapes& model_shapes,
+                               const RnnInputMode& input_mode,
+                               const Tensor* input, const Tensor* input_h,
+                               const Tensor* input_c, const Tensor* params,
+                               Tensor* output, Tensor* output_h,
+                               Tensor* output_c,
+                               AlgorithmConfig* best_algo_config) {
+    CHECK_NE(best_algo_config, nullptr);
+    *best_algo_config = AlgorithmConfig();
+    return Status::OK();
+  }
+
+  bool is_training() const { return is_training_; }
+  bool is_debug_mode_;
+  bool debug_use_tensor_ops_;
+  int64 debug_cudnn_rnn_algo_;
+
  private:
   Status AllocateOutputs(OpKernelContext* context,
                          const CudnnRnnModelShapes& model_shapes,
@@ -1135,12 +1281,197 @@ TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #undef REGISTER_GPU
 
+template <typename T>
+class CudnnRNNForwardOpV2<GPUDevice, T>
+    : public CudnnRNNForwardOp<GPUDevice, T> {
+ private:
+  using CudnnRNNForwardOp<GPUDevice, T>::is_training;
+  using CudnnRNNKernelCommon::CreateRnnDescriptor;
+  using CudnnRNNKernelCommon::dropout;
+  using CudnnRNNKernelCommon::HasInputC;
+  using CudnnRNNKernelCommon::model_types;
+
+ public:
+  explicit CudnnRNNForwardOpV2(OpKernelConstruction* context)
+      : CudnnRNNForwardOp<GPUDevice, T>(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    AlgorithmConfig best_algo_config;
+    CudnnRNNForwardOp<GPUDevice, T>::ComputeAndReturnAlgorithm(
+        context, &best_algo_config);
+    if (!context->status().ok()) {
+      return;
+    }
+
+    Tensor* output_host_reserved = nullptr;
+    // output_host_reserved stores opaque info used for backprop when running
+    // in training mode. At present, it includes a serialization of the best
+    // AlgorithmDesc picked during rnn forward pass autotune.
+    // int8 algorithm_id
+    // int8 use_tensor_op
+    // If autotune is not enabled, the algorithm_id is
+    // stream_executor::dnn::kDefaultAlgorithm and use_tensor_op is false. If
+    // running in inference mode, the output_host_reserved is currently not
+    // populated.
+    if (is_training()) {
+      OP_REQUIRES_OK(context, context->allocate_output(4, TensorShape({2}),
+                                                       &output_host_reserved));
+      auto output_host_reserved_int8 = output_host_reserved->vec<int8>();
+      output_host_reserved_int8(0) = best_algo_config.algorithm().algo_id();
+      output_host_reserved_int8(1) =
+          best_algo_config.algorithm().tensor_ops_enabled();
+    } else {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(4, {}, &output_host_reserved));
+    }
+  }
+
+ protected:
+  Status MaybeAutoTune(OpKernelContext* context,
+                       const CudnnRnnModelShapes& model_shapes,
+                       const RnnInputMode& input_mode, const Tensor* input,
+                       const Tensor* input_h, const Tensor* input_c,
+                       const Tensor* params, Tensor* output, Tensor* output_h,
+                       Tensor* output_c,
+                       AlgorithmConfig* algo_config) override {
+    CHECK_NE(algo_config, nullptr);
+    if (!CudnnRnnUseAutotune() || this->is_debug_mode_) {
+      *algo_config = AlgorithmConfig();
+      return Status::OK();
+    }
+
+    std::vector<AlgorithmDesc> algorithms;
+    auto* stream = context->op_device_context()->stream();
+    CHECK(stream->parent()->GetRnnAlgorithms(&algorithms));
+    if (algorithms.empty()) {
+      LOG(WARNING) << "No Rnn algorithm found";
+      return Status::OK();
+    }
+
+    const auto& modeltypes = model_types();
+    CudnnRnnParameters rnn_params(
+        model_shapes.num_layers, model_shapes.input_size,
+        model_shapes.num_units, model_shapes.seq_length,
+        model_shapes.batch_size, model_shapes.dir_count,
+        /*has_dropout=*/std::abs(dropout()) > 1e-8, is_training(),
+        modeltypes.rnn_mode, modeltypes.rnn_input_mode, input->dtype());
+
+    if (AutoTuneRnnConfigMap::GetInstance()->Find(rnn_params, algo_config)) {
+      return Status::OK();
+    }
+
+    // Create temp tensors when profiling backprop pass.
+    auto data_type = input->dtype();
+    Tensor output_backprop;
+    Tensor output_h_backprop;
+    Tensor output_c_backprop;
+    Tensor input_backprop;
+    Tensor input_h_backprop;
+    Tensor input_c_backprop;
+    Tensor params_backprop;
+    if (is_training()) {
+      TF_RETURN_IF_ERROR(context->allocate_temp(
+          data_type, model_shapes.output_shape, &output_backprop));
+      TF_RETURN_IF_ERROR(context->allocate_temp(
+          data_type, model_shapes.hidden_state_shape, &output_h_backprop));
+
+      TF_RETURN_IF_ERROR(
+          context->allocate_temp(data_type, params->shape(), &params_backprop));
+      TF_RETURN_IF_ERROR(context->allocate_temp(
+          data_type, model_shapes.input_shape, &input_backprop));
+      TF_RETURN_IF_ERROR(context->allocate_temp(
+          data_type, model_shapes.hidden_state_shape, &input_h_backprop));
+      if (HasInputC()) {
+        TF_RETURN_IF_ERROR(context->allocate_temp(
+            data_type, model_shapes.hidden_state_shape, &output_c_backprop));
+        TF_RETURN_IF_ERROR(context->allocate_temp(
+            data_type, model_shapes.hidden_state_shape, &input_c_backprop));
+      }
+    }
+    ProfileResult best_result;
+    for (auto& algo : algorithms) {
+      Status status;
+      ProfileResult final_profile_result;
+
+      ProfileResult fwd_profile_result;
+      ProfileResult bak_profile_result;
+
+      // RnnDescriptor is algorithm-dependent, thus not reusable.
+      std::unique_ptr<RnnDescriptor> rnn_desc;
+      // Use a temp scratch allocator for the random num generator.
+      CudnnRnnAllocatorInTemp<uint8> dropout_state_allocator(context);
+      if (!this->template CreateRnnDescriptor<T>(
+                   context, model_shapes, input_mode, AlgorithmConfig(algo),
+                   &dropout_state_allocator, &rnn_desc)
+               .ok()) {
+        continue;
+      }
+
+      // Again use temp scratch allocator during profiling.
+      CudnnRnnAllocatorInTemp<T> reserve_space_allocator(context);
+      CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
+      status = DoForward<T>(
+          context, *rnn_desc.get(), model_types(), model_shapes, input, input_h,
+          input_c, params, is_training(), output, output_h, output_c,
+          &reserve_space_allocator, &workspace_allocator, &fwd_profile_result);
+      if (!status.ok()) {
+        continue;
+      }
+
+      if (is_training()) {
+        // Get reserve space from the forward pass.
+        Tensor reserve_space = reserve_space_allocator.get_allocated_tensor(0);
+        status = DoBackward<T>(
+            context, *rnn_desc.get(), model_types(), model_shapes, input,
+            input_h, input_c, params, output, output_h, output_c,
+            &output_backprop, &output_h_backprop, &output_c_backprop,
+            &reserve_space, &input_backprop, &input_h_backprop,
+            &input_c_backprop, &params_backprop, &workspace_allocator,
+            &bak_profile_result);
+        if (!status.ok()) {
+          continue;
+        }
+        final_profile_result.set_elapsed_time_in_ms(
+            fwd_profile_result.elapsed_time_in_ms() +
+            bak_profile_result.elapsed_time_in_ms());
+      } else {
+        final_profile_result = fwd_profile_result;
+      }
+
+      auto total_time = final_profile_result.elapsed_time_in_ms();
+      VLOG(1) << "Profile Cudnn RNN algo " << algo.algo_id()
+              << " run time: " << total_time << " ms";
+      if (total_time < best_result.elapsed_time_in_ms()) {
+        best_result.set_elapsed_time_in_ms(total_time);
+        best_result.set_algorithm(algo);
+      }
+    }
+
+    if (!best_result.is_valid()) {
+      return Status(error::Code::INTERNAL, "No algorithm worked!");
+    }
+    algo_config->set_algorithm(best_result.algorithm());
+    AutoTuneRnnConfigMap::GetInstance()->Insert(rnn_params, *algo_config);
+    return Status::OK();
+  }
+};
+
+#define REGISTER_GPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNV2")               \
+                              .Device(DEVICE_GPU)          \
+                              .HostMemory("host_reserved") \
+                              .TypeConstraint<T>("T"),     \
+                          CudnnRNNForwardOpV2<GPUDevice, T>);
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
+
 // Run the backward operation of the RNN model.
 template <typename T>
 class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
  public:
-  typedef GPUDevice Device;
-
   explicit CudnnRNNBackwardOp(OpKernelConstruction* context)
       : CudnnRNNKernelCommon(context) {}
 
@@ -1183,15 +1514,16 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
     CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
-    const AlgorithmConfig default_algo_config;
+    AlgorithmConfig algo_config;
+    OP_REQUIRES_OK(context, GetAlgorithm(context, &algo_config));
     Status launch_status;
     {
       mutex_lock l(mu_);
       RnnDescriptor* rnn_desc_ptr = nullptr;
       OP_REQUIRES_OK(
           context, GetCachedRnnDescriptor<T>(context, model_shapes, input_mode,
-                                             default_algo_config,
-                                             &rnn_state_cache_, &rnn_desc_ptr));
+                                             algo_config, &rnn_state_cache_,
+                                             &rnn_desc_ptr));
       launch_status = DoBackward<T>(
           context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
           input_c, params, output, output_h, output_c, output_backprop,
@@ -1202,6 +1534,14 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context, launch_status);
   }
 
+ protected:
+  virtual Status GetAlgorithm(OpKernelContext* context,
+                              AlgorithmConfig* algo_config) {
+    CHECK_NE(algo_config, nullptr);
+    *algo_config = AlgorithmConfig();
+    return Status::OK();
+  }
+
  private:
   mutex mu_;
   RnnStateCache rnn_state_cache_ GUARDED_BY(mu_);
@@ -1300,6 +1640,52 @@ TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #undef REGISTER_GPU
 
+// Backward kernel for CudnnRNNBackpropV2. Behaves like CudnnRNNBackwardOp, but
+// overrides GetAlgorithm() to take the algorithm from the "host_reserved"
+// input instead of using a default-constructed AlgorithmConfig.
+template <typename T>
+class CudnnRNNBackwardOpV2<GPUDevice, T>
+    : public CudnnRNNBackwardOp<GPUDevice, T> {
+ public:
+  explicit CudnnRNNBackwardOpV2(OpKernelConstruction* context)
+      : CudnnRNNBackwardOp<GPUDevice, T>(context) {}
+
+ protected:
+  // Builds `algo_config` from the first two int8 elements of "host_reserved",
+  // which are passed to AlgorithmDesc in that order. Returns an error if the
+  // input is missing or is not a vector holding at least two elements.
+  Status GetAlgorithm(OpKernelContext* context,
+                      AlgorithmConfig* algo_config) override {
+    CHECK_NE(algo_config, nullptr);
+    const Tensor* host_reserved = nullptr;
+    TF_RETURN_IF_ERROR(context->input("host_reserved", &host_reserved));
+
+    // Validate before indexing: elements 0 and 1 are read unconditionally
+    // below, so a scalar or too-short tensor must be rejected here.
+    if (host_reserved->dims() != 1 || host_reserved->NumElements() < 2) {
+      return errors::InvalidArgument(
+          "host_reserved must be a vector with at least 2 elements, got shape ",
+          host_reserved->shape().DebugString());
+    }
+    auto host_reserved_int8 = host_reserved->vec<int8>();
+    const AlgorithmDesc algo_desc(host_reserved_int8(0), host_reserved_int8(1));
+    algo_config->set_algorithm(algo_desc);
+    return Status::OK();
+  }
+};
+
+#define REGISTER_GPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNBackpropV2")       \
+                              .Device(DEVICE_GPU)          \
+                              .HostMemory("host_reserved") \
+                              .TypeConstraint<T>("T"),     \
+                          CudnnRNNBackwardOpV2<GPUDevice, T>);
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
+
 // TODO(zhengxq): Add the conversion of Cudnn RNN Params from and to
 // its canonical form.
 
diff --git a/tensorflow/core/ops/cudnn_rnn_ops.cc b/tensorflow/core/ops/cudnn_rnn_ops.cc
index 37d70a22ef..f78f7a897a 100644
--- a/tensorflow/core/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/core/ops/cudnn_rnn_ops.cc
@@ -99,6 +99,49 @@ REGISTER_OP("CudnnRNN")
       return Status::OK();
     });
 
+REGISTER_OP("CudnnRNNV2")
+    .Input("input: T")
+    .Input("input_h: T")
+    .Input("input_c: T")
+    .Input("params: T")
+    .SetIsStateful()
+    .Output("output: T")
+    .Output("output_h: T")
+    .Output("output_c: T")
+    .Output("reserve_space: T")
+    .Output("host_reserved: int8")
+    .Attr("T: {float16, float32, float64}")
+    .Attr(kRNNModeAttrs)
+    .Attr(kRNNInputModeAttrs)
+    .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
+    .Attr("is_training: bool = true")
+    .SetShapeFn([](InferenceContext* c) {
+      auto input_shape = c->input(0);
+      auto input_h_shape = c->input(1);
+      auto seq_length = c->Dim(input_shape, 0);
+      auto batch_size = c->Dim(input_shape, 1);
+      auto num_units = c->Dim(input_h_shape, 2);
+      string direction;
+      TF_RETURN_IF_ERROR(c->GetAttr("direction", &direction));
+      string rnn_mode;
+      TF_RETURN_IF_ERROR(c->GetAttr("rnn_mode", &rnn_mode));
+      int dir_count = (direction == "bidirectional") ? 2 : 1;
+      DimensionHandle output_size;
+      TF_RETURN_IF_ERROR(c->Multiply(num_units, dir_count, &output_size));
+      auto output_shape = c->MakeShape({seq_length, batch_size, output_size});
+      auto output_h_shape = input_h_shape;
+      auto output_c_shape TF_ATTRIBUTE_UNUSED =
+          (rnn_mode == "lstm") ? output_h_shape : c->MakeShape({});
+      c->set_output(0, output_shape);
+      c->set_output(1, output_h_shape);
+      c->set_output(2, output_c_shape);
+      c->set_output(3, c->UnknownShape());
+      c->set_output(4, c->UnknownShape());
+      return Status::OK();
+    });
 
 REGISTER_OP("CudnnRNNBackprop")
     .Input("input: T")
@@ -136,6 +179,42 @@ REGISTER_OP("CudnnRNNBackprop")
       return Status::OK();
     });
 
+REGISTER_OP("CudnnRNNBackpropV2")
+    .Input("input: T")
+    .Input("input_h: T")
+    .Input("input_c: T")
+    .Input("params: T")
+    .Input("output: T")
+    .Input("output_h: T")
+    .Input("output_c: T")
+    .Input("output_backprop: T")
+    .Input("output_h_backprop: T")
+    .Input("output_c_backprop: T")
+    .Input("reserve_space: T")
+    .Input("host_reserved: int8")
+    .SetIsStateful()
+    .Output("input_backprop: T")
+    .Output("input_h_backprop: T")
+    .Output("input_c_backprop: T")
+    .Output("params_backprop: T")
+    .Attr("T: {float16, float32, float64}")
+    .Attr(kRNNModeAttrs)
+    .Attr(kRNNInputModeAttrs)
+    .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
+    .SetShapeFn([](InferenceContext* c) {
+      auto input_shape = c->input(0);
+      auto input_h_shape = c->input(1);
+      auto input_c_shape = c->input(2);
+      auto params_shape = c->input(3);
+      c->set_output(0, input_shape);
+      c->set_output(1, input_h_shape);
+      c->set_output(2, input_c_shape);
+      c->set_output(3, params_shape);
+      return Status::OK();
+    });
 
 REGISTER_OP("CudnnRNNParamsToCanonical")
     .Input("num_layers: int32")
diff --git a/tensorflow/core/ops/cudnn_rnn_ops_test.cc b/tensorflow/core/ops/cudnn_rnn_ops_test.cc
index 95d45c0bb8..2dd867561b 100644
--- a/tensorflow/core/ops/cudnn_rnn_ops_test.cc
+++ b/tensorflow/core/ops/cudnn_rnn_ops_test.cc
@@ -30,6 +30,24 @@ TEST(CudnnRNNOpsTest, ParamsSize_ShapeFn) {
 }
 
 TEST(CudnnRNNOpsTest, ForwardLstm_ShapeFn) {
+  int seq_length = 2;
+  int batch_size = 3;
+  int num_units = 4;
+  int num_layers = 5;
+  int dir_count = 1;
+  std::vector<int> input_shape = {seq_length, batch_size, num_units};
+  std::vector<int> input_h_shape = {num_layers * dir_count, batch_size,
+                                    num_units};
+  std::vector<int> output_shape = {seq_length, batch_size,
+                                   num_units * dir_count};
+  auto shape_to_str = [](const std::vector<int>& v) {
+    return strings::StrCat("[", str_util::Join(v, ","), "]");
+  };
+  string input_shapes_desc = strings::StrCat(
+      shape_to_str(input_shape), ";", shape_to_str(input_h_shape), ";",
+      shape_to_str(input_h_shape), ";", "[?]");
+  string output_shapes_desc = "[d0_0,d0_1,d1_2];in1;in1;?";
+
   ShapeInferenceTestOp op("CudnnRNN");
   TF_ASSERT_OK(NodeDefBuilder("test", "CudnnRNN")
                    .Input({"input", 0, DT_FLOAT})
@@ -40,6 +58,10 @@ TEST(CudnnRNNOpsTest, ForwardLstm_ShapeFn) {
                    .Attr("input_mode", "auto_select")
                    .Attr("direction", "unidirectional")
                    .Finalize(&op.node_def));
+  INFER_OK(op, input_shapes_desc, output_shapes_desc);
+}
+
+TEST(CudnnRNNOpsTest, ForwardV2Lstm_ShapeFn) {
   int seq_length = 2;
   int batch_size = 3;
   int num_units = 4;
@@ -56,7 +78,18 @@ TEST(CudnnRNNOpsTest, ForwardLstm_ShapeFn) {
   string input_shapes_desc = strings::StrCat(
       shape_to_str(input_shape), ";", shape_to_str(input_h_shape), ";",
       shape_to_str(input_h_shape), ";", "[?]");
-  string output_shapes_desc = "[d0_0,d0_1,d1_2];in1;in1;?";
+  string output_shapes_desc = "[d0_0,d0_1,d1_2];in1;in1;?;?";
+
+  ShapeInferenceTestOp op("CudnnRNNV2");
+  TF_ASSERT_OK(NodeDefBuilder("test", "CudnnRNNV2")
+                   .Input({"input", 0, DT_FLOAT})
+                   .Input({"input_h", 0, DT_FLOAT})
+                   .Input({"input_c", 0, DT_FLOAT})
+                   .Input({"params", 0, DT_FLOAT})
+                   .Attr("rnn_mode", "lstm")
+                   .Attr("input_mode", "auto_select")
+                   .Attr("direction", "unidirectional")
+                   .Finalize(&op.node_def));
   INFER_OK(op, input_shapes_desc, output_shapes_desc);
 }
 
diff --git a/tensorflow/core/util/use_cudnn.cc b/tensorflow/core/util/use_cudnn.cc
index d7d03f151e..c119df6419 100644
--- a/tensorflow/core/util/use_cudnn.cc
+++ b/tensorflow/core/util/use_cudnn.cc
@@ -22,9 +22,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-#define ADD_CUDNN_FLAG(func_name, flag_name, default_value)                \
+#define ADD_BOOL_CUDNN_FLAG(func_name, flag_name, default_value)           \
   bool func_name() {                                                       \
-    bool value;                                                            \
+    bool value = default_value;                                            \
     Status status = ReadBoolFromEnvVar(#flag_name, default_value, &value); \
     if (!status.ok()) {                                                    \
       LOG(ERROR) << status;                                                \
@@ -32,12 +32,44 @@ namespace tensorflow {
     return value;                                                          \
   }
 
-ADD_CUDNN_FLAG(CanUseCudnn, TF_USE_CUDNN, true);
-ADD_CUDNN_FLAG(CudnnUseAutotune, TF_CUDNN_USE_AUTOTUNE, true);
-ADD_CUDNN_FLAG(CudnnDisableConv1x1Optimization,
-               TF_CUDNN_DISABLE_CONV_1X1_OPTIMIZATION, false);
+ADD_BOOL_CUDNN_FLAG(CanUseCudnn, TF_USE_CUDNN, true);
+ADD_BOOL_CUDNN_FLAG(CudnnUseAutotune, TF_CUDNN_USE_AUTOTUNE, true);
+// Whether to auto-tune Cudnn RNN forward and backward passes to pick
+// statistically the best cudnnRNNAlgo_t and cudnnMathType_t.
+// The flag is disabled when TF_DEBUG_CUDNN_RNN is turned on.
+ADD_BOOL_CUDNN_FLAG(CudnnRnnUseAutotune, TF_CUDNN_RNN_USE_AUTOTUNE, true);
+ADD_BOOL_CUDNN_FLAG(CudnnDisableConv1x1Optimization,
+                    TF_CUDNN_DISABLE_CONV_1X1_OPTIMIZATION, false);
 
-#undef ADD_CUDNN_FLAG
+// Whether to run Cudnn RNN forward and backward in debug mode, where users can
+// force a specified cudnnRNNAlgo_t and cudnnMathType_t, when used together with
+// the following two env vars:
+// TF_DEBUG_CUDNN_RNN_USE_TENSOR_OPS
+// TF_DEBUG_CUDNN_RNN_ALGO
+// By default it is disabled and only intended for testing and profiling.
+ADD_BOOL_CUDNN_FLAG(DebugCudnnRnn, TF_DEBUG_CUDNN_RNN, false);
+// Whether to use TENSOR_OP_MATH in Cudnn RNN for both forward and backward
+// passes. Only effective when TF_DEBUG_CUDNN_RNN is true.
+// Note: none of the persistent RNN algorithms support TENSOR_OP_MATH before
+// Cudnn 7.1. See Nvidia Cudnn manual for more details.
+ADD_BOOL_CUDNN_FLAG(DebugCudnnRnnUseTensorOps,
+                    TF_DEBUG_CUDNN_RNN_USE_TENSOR_OPS, false);
+#undef ADD_BOOL_CUDNN_FLAG
+
+#define ADD_INT64_CUDNN_FLAG(func_name, flag_name, default_value)           \
+  int64 func_name() {                                                       \
+    int64 value = default_value;                                            \
+    Status status = ReadInt64FromEnvVar(#flag_name, default_value, &value); \
+    if (!status.ok()) {                                                     \
+      LOG(ERROR) << status;                                                 \
+    }                                                                       \
+    return value;                                                           \
+  }
+// Cudnn RNN algorithm to use for both forward and backward pass. Only effective
+// when TF_DEBUG_CUDNN_RNN is true. See Nvidia Cudnn manual for allowed
+// cudnnRNNAlgo_t.
+ADD_INT64_CUDNN_FLAG(DebugCudnnRnnAlgo, TF_DEBUG_CUDNN_RNN_ALGO, -1);
+#undef ADD_INT64_CUDNN_FLAG
 
 FP16ConvMode CudnnConvComputeMode() {
   string value;
diff --git a/tensorflow/core/util/use_cudnn.h b/tensorflow/core/util/use_cudnn.h
index a39a032e3f..f8cc5944d7 100644
--- a/tensorflow/core/util/use_cudnn.h
+++ b/tensorflow/core/util/use_cudnn.h
@@ -15,8 +15,10 @@ limitations under the License.
 
 // The utility to check Cudnn dependency and set Cudnn-related flags.
 
-#ifndef TENSORFLOW_UTIL_USE_CUDNN_H_
-#define TENSORFLOW_UTIL_USE_CUDNN_H_
+#ifndef TENSORFLOW_CORE_UTIL_USE_CUDNN_H_
+#define TENSORFLOW_CORE_UTIL_USE_CUDNN_H_
+
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
@@ -31,9 +33,12 @@ enum class FP16ConvMode {
 
 bool CanUseCudnn();
 bool CudnnUseAutotune();
+bool CudnnRnnUseAutotune();
 bool CudnnDisableConv1x1Optimization();
 FP16ConvMode CudnnConvComputeMode();
-
+bool DebugCudnnRnn();
+bool DebugCudnnRnnUseTensorOps();
+int64 DebugCudnnRnnAlgo();
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_USE_CUDNN_H_
+#endif  // TENSORFLOW_CORE_UTIL_USE_CUDNN_H_
diff --git a/tensorflow/python/ops/cudnn_rnn_grad.py b/tensorflow/python/ops/cudnn_rnn_grad.py
index 97331bb5b5..c618c470f2 100644
--- a/tensorflow/python/ops/cudnn_rnn_grad.py
+++ b/tensorflow/python/ops/cudnn_rnn_grad.py
@@ -26,7 +26,7 @@ def _cudnn_rnn_backward(op, *grads):
   """Gradients for the CudnnRNN op."""
   if not op.get_attr("is_training"):
     raise ValueError(
-        "CudnnRNN must set is_training to True to be used in gradients")
+        "To use CudnnRNN in gradients, is_training must be set to True.")
   return gen_cudnn_rnn_ops.cudnn_rnn_backprop(
       input=op.inputs[0],
       input_h=op.inputs[1],
@@ -45,3 +45,29 @@ def _cudnn_rnn_backward(op, *grads):
       rnn_mode=op.get_attr("rnn_mode"),
       input_mode=op.get_attr("input_mode"),
       direction=op.get_attr("direction"))
+
+
+@ops.RegisterGradient("CudnnRNNV2")
+def _cudnn_rnn_backward_v2(op, *grad):
+  if not op.get_attr("is_training"):
+    raise ValueError(
+        "To use CudnnRNNV2 in gradients, is_training must be set to True.")
+  return gen_cudnn_rnn_ops.cudnn_rnn_backprop_v2(
+      input=op.inputs[0],
+      input_h=op.inputs[1],
+      input_c=op.inputs[2],
+      params=op.inputs[3],
+      output=op.outputs[0],
+      output_h=op.outputs[1],
+      output_c=op.outputs[2],
+      output_backprop=grad[0],
+      output_h_backprop=grad[1],
+      output_c_backprop=grad[2],
+      reserve_space=op.outputs[3],
+      host_reserved=op.outputs[4],
+      dropout=op.get_attr("dropout"),
+      seed=op.get_attr("seed"),
+      seed2=op.get_attr("seed2"),
+      rnn_mode=op.get_attr("rnn_mode"),
+      input_mode=op.get_attr("input_mode"),
+      direction=op.get_attr("direction"))
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 102419a264..42a77aa3f8 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -312,7 +313,10 @@ CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(
 // clang-format off
 #if CUDNN_VERSION >= 6000
 #define CUDNN_DNN_ROUTINE_EACH_R6(__macro)                    \
-  __macro(cudnnSetRNNDescriptor_v6)
+  __macro(cudnnSetRNNDescriptor_v6)                           \
+  __macro(cudnnCreatePersistentRNNPlan)                       \
+  __macro(cudnnDestroyPersistentRNNPlan)                      \
+  __macro(cudnnSetPersistentRNNPlan)
 
 // clang-format on
 CUDNN_DNN_ROUTINE_EACH_R6(STREAM_EXECUTOR_CUDNN_WRAP)
@@ -1195,7 +1199,7 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
  public:
   CudnnRnnDescriptor(CUDAExecutor* parent, cudnnHandle_t cudnn_handle,
                      int num_layers, int hidden_size, int input_size,
-                     cudnnRNNInputMode_t input_mode,
+                     int batch_size, cudnnRNNInputMode_t input_mode,
                      cudnnDirectionMode_t direction_mode,
                      cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type,
                      cudnnDataType_t compute_type,
@@ -1207,6 +1211,10 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
         num_layers_(num_layers),
         hidden_size_(hidden_size),
         input_size_(input_size),
+        batch_size_(batch_size),
+#if CUDNN_VERSION >= 6000
+        rnn_plan_(nullptr),
+#endif
         input_mode_(input_mode),
         direction_mode_(direction_mode),
         rnn_mode_(rnn_mode),
@@ -1226,12 +1234,26 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
     CUDNN_RETURN_IF_FAIL(status, "Unable to create RNN descriptor");
 #if CUDNN_VERSION >= 6000
     // TODO: allow the user to choose an algorithm.
-    cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config_.algorithm());
+    rnn_algo_ = ToCudnnRNNAlgo(algorithm_config_.algorithm());
     status = wrap::cudnnSetRNNDescriptor_v6(
-        parent, cudnn_handle, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
-        num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
-        input_mode /*inputMode*/, direction_mode /*direction*/,
-        rnn_mode /*mode*/, rnn_algo /*algo*/, compute_type /*dataType*/);
+        parent, cudnn_handle, /*rnnDesc=*/rnn_desc_, /*hiddenSize=*/hidden_size,
+        /*numLayers=*/num_layers, /*dropoutDesc=*/dropout_handle(),
+        /*inputMode=*/input_mode, /*direction=*/direction_mode,
+        /*mode=*/rnn_mode, /*algo=*/rnn_algo_, /*dataType=*/compute_type);
+    CUDNN_RETURN_IF_FAIL(status, ::tensorflow::strings::Printf(
+                                     "Unable to update RNN descriptor with "
+                                     "algo_id: %d and compute_type: %d",
+                                     static_cast<int>(rnn_algo_),
+                                     static_cast<int>(compute_type)));
+
+    if (rnn_algo_ == CUDNN_RNN_ALGO_PERSIST_DYNAMIC) {
+      CHECK_GE(batch_size_, 0);
+      status = wrap::cudnnCreatePersistentRNNPlan(
+          parent, rnn_desc_, batch_size_, data_type_, &rnn_plan_);
+      CUDNN_RETURN_IF_FAIL(status, "Unable to create persistent RNN plan.");
+      status = wrap::cudnnSetPersistentRNNPlan(parent, rnn_desc_, rnn_plan_);
+      CUDNN_RETURN_IF_FAIL(status, "Unable to update persistent RNN plan.");
+    }
 #else
     CHECK(algorithm_config_.is_default())
         << "Non-default algorithm not supported for CUDA version < 6.0";
@@ -1240,8 +1262,8 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
         num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
         input_mode /*inputMode*/, direction_mode /*direction*/,
         rnn_mode /*mode*/, compute_type /*dataType*/);
-#endif
     CUDNN_RETURN_IF_FAIL(status, "Unable to update RNN descriptor");
+#endif
 
     // Create the params handle.
     cudnn_params_desc_.reset(
@@ -1254,8 +1276,14 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   }
   ~CudnnRnnDescriptor() override {
     if (rnn_desc_) {
-      cudnnStatus_t status =
-          wrap::cudnnDestroyRNNDescriptor(parent_, rnn_desc_);
+      cudnnStatus_t status;
+#if CUDNN_VERSION >= 6000
+      if (rnn_algo_ == CUDNN_RNN_ALGO_PERSIST_DYNAMIC && rnn_plan_) {
+        status = wrap::cudnnDestroyPersistentRNNPlan(parent_, rnn_plan_);
+        CUDNN_RETURN_IF_FAIL(status, "Unable to destroy persistent RNN plan.");
+      }
+#endif
+      status = wrap::cudnnDestroyRNNDescriptor(parent_, rnn_desc_);
       CUDNN_RETURN_IF_FAIL(status, "Unable to destroy RNN descriptor");
     }
   }
@@ -1280,6 +1308,7 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   int num_layers() const { return num_layers_; }
   int hidden_size() const { return hidden_size_; }
   int input_size() const { return input_size_; }
+  int batch_size() const { return batch_size_; }
   cudnnRNNInputMode_t input_mode() const { return input_mode_; }
   cudnnDirectionMode_t direction_mode() const { return direction_mode_; }
   cudnnRNNMode_t rnn_mode() const { return rnn_mode_; }
@@ -1314,6 +1343,13 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   int num_layers_;
   int hidden_size_;
   int input_size_;
+  // batch_size_ is set to -1 when not using CUDNN_RNN_ALGO_PERSIST_DYNAMIC
+  // algorithm.
+  int batch_size_;
+#if CUDNN_VERSION >= 6000
+  cudnnRNNAlgo_t rnn_algo_;
+  cudnnPersistentRNNPlan_t rnn_plan_;
+#endif
   cudnnRNNInputMode_t input_mode_;
   cudnnDirectionMode_t direction_mode_;
   cudnnRNNMode_t rnn_mode_;
@@ -1970,22 +2006,20 @@ bool CudnnSupport::DoRnnBackwardImpl(
 #endif  // CUDNN_VERSION
 
 port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
-CudnnSupport::createRnnDescriptor(int num_layers, int hidden_size,
-                                  int input_size, dnn::RnnInputMode input_mode,
-                                  dnn::RnnDirectionMode direction_mode,
-                                  dnn::RnnMode rnn_mode,
-                                  dnn::DataType data_type,
-                                  const dnn::AlgorithmConfig& algorithm_config,
-                                  float dropout, uint64 seed,
-                                  ScratchAllocator* state_allocator) {
+CudnnSupport::createRnnDescriptor(
+    int num_layers, int hidden_size, int input_size, int batch_size,
+    dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
+    dnn::RnnMode rnn_mode, dnn::DataType data_type,
+    const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
+    ScratchAllocator* state_allocator) {
 #if CUDNN_VERSION >= 5000
   mutex_lock lock{dnn_handle_mutex_};
   std::unique_ptr<CudnnRnnDescriptor> rnn_desc(new CudnnRnnDescriptor(
       parent_, ToHandle(dnn_handle_), num_layers, hidden_size, input_size,
-      ToCudnnRnnInputMode(input_mode), ToCudnnRnnDirectionMode(direction_mode),
-      ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type),
-      GetRnnComputeType(data_type), algorithm_config, dropout, seed,
-      state_allocator));
+      batch_size, ToCudnnRnnInputMode(input_mode),
+      ToCudnnRnnDirectionMode(direction_mode), ToCudnnRnnMode(rnn_mode),
+      ToCudnnDataType(data_type), GetRnnComputeType(data_type),
+      algorithm_config, dropout, seed, state_allocator));
   if (!rnn_desc->ok()) {
     return rnn_desc->Status();
   }
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 5ded7cf154..7d53dbe4a5 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -48,7 +48,7 @@ class CudnnSupport : public dnn::DnnSupport {
   port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
-      int num_layers, int hidden_size, int input_size,
+      int num_layers, int hidden_size, int input_size, int batch_size,
       dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
       dnn::RnnMode rnn_mode, dnn::DataType data_type,
       const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 6edb572820..031c82d3f4 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -15,12 +15,17 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/dnn.h"
 
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
 namespace stream_executor {
 namespace dnn {
 
+uint64 AlgorithmDesc::hash() const {
+  return ::tensorflow::Hash64Combine(algo_, tensor_ops_enabled_);
+}
+
 bool DnnSupport::GetConvolveAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<AlgorithmDesc>* out_algorithms) {
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 39f21d8b10..0c2e083b39 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -712,6 +712,7 @@ class AlgorithmDesc {
     return this->algo_ == other.algo_ &&
            this->tensor_ops_enabled_ == other.tensor_ops_enabled_;
   }
+  uint64 hash() const;
 
  private:
   enum { kDefaultAlgorithm = -1 };
@@ -2023,7 +2024,7 @@ class DnnSupport {
   //    is no longer in use.
   virtual port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
   createRnnDescriptor(int num_layers, int hidden_size, int input_size,
-                      dnn::RnnInputMode input_mode,
+                      int batch_size, dnn::RnnInputMode input_mode,
                       dnn::RnnDirectionMode direction_mode,
                       dnn::RnnMode rnn_mode, dnn::DataType data_type,
                       const dnn::AlgorithmConfig& algorithm_config,
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 2e1adeb31e..20579790ef 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -350,7 +350,7 @@ bool StreamExecutor::GetBlasGemmAlgorithms(
 
 port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
 StreamExecutor::createRnnDescriptor(
-    int num_layers, int hidden_size, int input_size,
+    int num_layers, int hidden_size, int input_size, int batch_size,
     dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
     dnn::RnnMode rnn_mode, dnn::DataType data_type,
     const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed,
@@ -361,8 +361,9 @@ StreamExecutor::createRnnDescriptor(
                         "Fail to find the dnn implementation.");
   }
   return dnn_support->createRnnDescriptor(
-      num_layers, hidden_size, input_size, input_mode, direction_mode, rnn_mode,
-      data_type, algorithm_config, dropout, seed, state_allocator);
+      num_layers, hidden_size, input_size, batch_size, input_mode,
+      direction_mode, rnn_mode, data_type, algorithm_config, dropout, seed,
+      state_allocator);
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 39af7115d8..ab6b00f660 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -373,7 +373,7 @@ class StreamExecutor {
   // Create an RNN descriptor based on model shapes and configurations.
   // The caller retains the ownership of the descriptor.
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
-      int num_layers, int hidden_size, int input_size,
+      int num_layers, int hidden_size, int input_size, int batch_size,
       dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
       dnn::RnnMode rnn_mode, dnn::DataType data_type,
       const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed,
-- 
GitLab


From 752984871bb8a89e774f1c329d42b3bb7b7d7b3f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 19:18:20 -0700
Subject: [PATCH 445/450] Update ops-related pbtxt files.

PiperOrigin-RevId: 194335460
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 263 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 263 ++++++++++++++++++
 2 files changed, 526 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index f15e020346..71ba5f016a 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -15181,6 +15181,148 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CudnnRNNBackpropV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  output_arg {
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "CudnnRNNCanonicalToParams"
   input_arg {
@@ -15502,6 +15644,127 @@ op {
     }
   }
 }
+op {
+  name: "CudnnRNNV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Cumprod"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 0211b16b51..90368fe614 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -6641,6 +6641,148 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CudnnRNNBackpropV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  output_arg {
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "CudnnRNNCanonicalToParams"
   input_arg {
@@ -6962,6 +7104,127 @@ op {
     }
   }
 }
+op {
+  name: "CudnnRNNV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Cumprod"
   input_arg {
-- 
GitLab


From 5a751a41aa92068ca0dbf991524cb4bed90c630f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 19:18:53 -0700
Subject: [PATCH 446/450] Reduce number of combinations that are tested

PiperOrigin-RevId: 194335483
---
 .../linear_optimizer/python/kernel_tests/sdca_ops_test.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index 6e6c812adc..b5741967ab 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -39,8 +39,8 @@ from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import googletest
 
 _MAX_ITERATIONS = 100
-_SHARD_NUMBERS = [None, 1, 3, 10]
-_NUM_LOSS_PARTITIONS = [2, 4]
+_SHARD_NUMBERS = [None, 1, 3]
+_NUM_LOSS_PARTITIONS = [4]
 
 
 def make_example_proto(feature_dict, target, value=1.0):
-- 
GitLab


From e31dc1c13994d85a95689f0f61092489ebaa83cf Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 25 Apr 2018 19:43:14 -0700
Subject: [PATCH 447/450] Improve shape invariant error message for
 tf.while_loop.

PiperOrigin-RevId: 194336902
---
 .../kernel_tests/control_flow_ops_py_test.py     |  9 ++++-----
 tensorflow/python/ops/control_flow_ops.py        | 16 +++++++++-------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index e27eb00818..209411cf51 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -1135,11 +1135,10 @@ class ControlFlowTest(test.TestCase):
 
       with self.assertRaisesRegexp(
           ValueError,
-          r"The shape for while_1/Merge_1:0 is not an invariant for the loop. "
-          r"It enters the loop with shape \(2, 2\), but has shape \(4, 2\) "
-          r"after one iteration. Provide shape invariants using either the "
-          r"`shape_invariants` argument of tf.while_loop or set_shape\(\) on "
-          r"the loop variables."):
+          r"Input tensor 'ones:0' enters the loop with shape \(2, 2\), but has "
+          r"shape \(4, 2\) after one iteration. To allow the shape to vary "
+          r"across iterations, use the `shape_invariants` argument of "
+          r"tf.while_loop to specify a less-specific shape."):
         r = control_flow_ops.while_loop(c, b, [i, m])
 
   def testWhileShapeInferenceSparseTensor(self):
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index a1bfe450c8..f1e068d514 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -622,14 +622,16 @@ def _EnforceShapeInvariant(merge_var, next_var):
     m_shape = merge_var.get_shape()
     n_shape = next_var.get_shape()
     if not _ShapeLessThanOrEqual(n_shape, m_shape):
-      # TODO(skyewm): get original loop input that caused the shape error and
-      # report its name instead of the merge node's.
+      enter = merge_var.op.inputs[0].op
+      assert util.IsLoopEnter(enter)
+      input_t = enter.inputs[0]
+      assert input_t.shape == m_shape
       raise ValueError(
-          "The shape for %s is not an invariant for the loop. It enters "
-          "the loop with shape %s, but has shape %s after one iteration. "
-          "Provide shape invariants using either the `shape_invariants` "
-          "argument of tf.while_loop or set_shape() on the loop variables." %
-          (merge_var.name, m_shape, n_shape))
+          "Input tensor '%s' enters the loop with shape %s, but has shape %s "
+          "after one iteration. To allow the shape to vary across iterations, "
+          "use the `shape_invariants` argument of tf.while_loop to specify a "
+          "less-specific shape." %
+          (input_t.name, input_t.shape, n_shape))
   else:
     if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
       raise TypeError("Type %s not supported" % type(var))
-- 
GitLab


From 43a7072882196c7ac2d9429050a3140b1ecb52db Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 25 Apr 2018 19:47:06 -0700
Subject: [PATCH 448/450] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 194337205

---
 tensorflow/go/op/wrappers.go | 238 +++++++++++++++++------------------
 1 file changed, 119 insertions(+), 119 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 4d91f2b68e..83de1c5a92 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2425,6 +2425,125 @@ func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...Gathe
 	return op.Output(0)
 }
 
+// Creates a tensor filled with a scalar value.
+//
+// This operation creates a tensor of shape `dims` and fills it with `value`.
+//
+// For example:
+//
+// ```
+// # Output tensor has shape [2, 3].
+// fill([2, 3], 9) ==> [[9, 9, 9]
+//                      [9, 9, 9]]
+// ```
+//
+// Arguments:
+//	dims: 1-D. Represents the shape of the output tensor.
+//	value: 0-D (scalar). Value to fill the returned tensor.
+//
+// @compatibility(numpy)
+// Equivalent to np.full
+// @end_compatibility
+func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Fill",
+		Input: []tf.Input{
+			dims, value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// EditDistanceAttr is an optional argument to EditDistance.
+type EditDistanceAttr func(optionalAttr)
+
+// EditDistanceNormalize sets the optional normalize attribute to value.
+//
+// value: boolean (if true, edit distances are normalized by length of truth).
+//
+// The output is:
+// If not specified, defaults to true
+func EditDistanceNormalize(value bool) EditDistanceAttr {
+	return func(m optionalAttr) {
+		m["normalize"] = value
+	}
+}
+
+// Computes the (possibly normalized) Levenshtein Edit Distance.
+//
+// The inputs are variable-length sequences provided by SparseTensors
+//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
+// and
+//   (truth_indices, truth_values, truth_shape).
+//
+// The inputs are:
+//
+// Arguments:
+//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
+// This is an N x R int64 matrix.
+//	hypothesis_values: The values of the hypothesis list SparseTensor.
+// This is an N-length vector.
+//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
+// This is an R-length vector.
+//	truth_indices: The indices of the truth list SparseTensor.
+// This is an M x R int64 matrix.
+//	truth_values: The values of the truth list SparseTensor.
+// This is an M-length vector.
+//	truth_shape: truth indices, vector.
+//
+// Returns A dense float tensor with rank R - 1.
+//
+// For the example input:
+//
+//     // hypothesis represents a 2x1 matrix with variable-length values:
+//     //   (0,0) = ["a"]
+//     //   (1,0) = ["b"]
+//     hypothesis_indices = [[0, 0, 0],
+//                           [1, 0, 0]]
+//     hypothesis_values = ["a", "b"]
+//     hypothesis_shape = [2, 1, 1]
+//
+//     // truth represents a 2x2 matrix with variable-length values:
+//     //   (0,0) = []
+//     //   (0,1) = ["a"]
+//     //   (1,0) = ["b", "c"]
+//     //   (1,1) = ["a"]
+//     truth_indices = [[0, 1, 0],
+//                      [1, 0, 0],
+//                      [1, 0, 1],
+//                      [1, 1, 0]]
+//     truth_values = ["a", "b", "c", "a"]
+//     truth_shape = [2, 2, 2]
+//     normalize = true
+//
+// The output will be:
+//
+//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
+//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EditDistance",
+		Input: []tf.Input{
+			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Clips tensor values to a specified min and max.
 //
 // Given a tensor `t`, this operation returns a tensor of the same type and
@@ -13001,39 +13120,6 @@ func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Creates a tensor filled with a scalar value.
-//
-// This operation creates a tensor of shape `dims` and fills it with `value`.
-//
-// For example:
-//
-// ```
-// # Output tensor has shape [2, 3].
-// fill([2, 3], 9) ==> [[9, 9, 9]
-//                      [9, 9, 9]]
-// ```
-//
-// Arguments:
-//	dims: 1-D. Represents the shape of the output tensor.
-//	value: 0-D (scalar). Value to fill the returned tensor.
-//
-// @compatibility(numpy)
-// Equivalent to np.full
-// @end_compatibility
-func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fill",
-		Input: []tf.Input{
-			dims, value,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // 2D fast Fourier transform.
 //
 // Computes the 2-dimensional discrete Fourier transform over the inner-most
@@ -30624,89 +30710,3 @@ func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// EditDistanceAttr is an optional argument to EditDistance.
-type EditDistanceAttr func(optionalAttr)
-
-// EditDistanceNormalize sets the optional normalize attribute to value.
-//
-// value: boolean (if true, edit distances are normalized by length of truth).
-//
-// The output is:
-// If not specified, defaults to true
-func EditDistanceNormalize(value bool) EditDistanceAttr {
-	return func(m optionalAttr) {
-		m["normalize"] = value
-	}
-}
-
-// Computes the (possibly normalized) Levenshtein Edit Distance.
-//
-// The inputs are variable-length sequences provided by SparseTensors
-//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
-// and
-//   (truth_indices, truth_values, truth_shape).
-//
-// The inputs are:
-//
-// Arguments:
-//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
-// This is an N x R int64 matrix.
-//	hypothesis_values: The values of the hypothesis list SparseTensor.
-// This is an N-length vector.
-//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
-// This is an R-length vector.
-//	truth_indices: The indices of the truth list SparseTensor.
-// This is an M x R int64 matrix.
-//	truth_values: The values of the truth list SparseTensor.
-// This is an M-length vector.
-//	truth_shape: truth indices, vector.
-//
-// Returns A dense float tensor with rank R - 1.
-//
-// For the example input:
-//
-//     // hypothesis represents a 2x1 matrix with variable-length values:
-//     //   (0,0) = ["a"]
-//     //   (1,0) = ["b"]
-//     hypothesis_indices = [[0, 0, 0],
-//                           [1, 0, 0]]
-//     hypothesis_values = ["a", "b"]
-//     hypothesis_shape = [2, 1, 1]
-//
-//     // truth represents a 2x2 matrix with variable-length values:
-//     //   (0,0) = []
-//     //   (0,1) = ["a"]
-//     //   (1,0) = ["b", "c"]
-//     //   (1,1) = ["a"]
-//     truth_indices = [[0, 1, 0],
-//                      [1, 0, 0],
-//                      [1, 0, 1],
-//                      [1, 1, 0]]
-//     truth_values = ["a", "b", "c", "a"]
-//     truth_shape = [2, 2, 2]
-//     normalize = true
-//
-// The output will be:
-//
-//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
-//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "EditDistance",
-		Input: []tf.Input{
-			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-- 
GitLab


From a2cdf98cec1e1d58eb572f45c19a6806da80f59b Mon Sep 17 00:00:00 2001
From: Patrick Nguyen <drpng@google.com>
Date: Thu, 26 Apr 2018 10:00:06 -0700
Subject: [PATCH 449/450] Fix conflict.

Use the exposed Scaffold.default_local_init_op instead of _default_local_init_op.
---
 tensorflow/python/estimator/estimator.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 2f1212d5a2..2dd5e1e8aa 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -636,11 +636,9 @@ class Estimator(object):
               sharded=True)
           saver_for_restore.restore(session, checkpoint_path)
 
-          # pylint: disable=protected-access
           local_init_op = (
               estimator_spec.scaffold.local_init_op or
-              monitored_session.Scaffold._default_local_init_op())
-          # pylint: enable=protected-access
+              monitored_session.Scaffold.default_local_init_op())
 
           # Perform the export
           builder = saved_model_builder.SavedModelBuilder(temp_export_dir)
-- 
GitLab


From 1bf9ec7f8545c7aa6fa915c6576a3b984af59ded Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 26 Apr 2018 13:47:51 -0700
Subject: [PATCH 450/450] Fix cmake build issues with GPU on Linux (#18775)

* Update include file for cmake

While attempting to build tensorflow with cmake for gpu the following
build error happens:
```
/home/ubuntu/tensorflow/tensorflow/core/platform/default/gpu/cupti_wrapper.h:26:45: fatal error: cuda/extras/CUPTI/include/cupti.h: No such file or directory
compilation terminated.
CMakeFiles/tf_core_cpu.dir/build.make:3302: recipe for target 'CMakeFiles/tf_core_cpu.dir/home/ubuntu/tensorflow/tensorflow/core/platform/default/gpu/cupti_wrapper.cc.o' failed
make[2]: *** [CMakeFiles/tf_core_cpu.dir/home/ubuntu/tensorflow/tensorflow/core/platform/default/gpu/cupti_wrapper.cc.o] Error 1
CMakeFiles/Makefile2:2402: recipe for target 'CMakeFiles/tf_core_cpu.dir/all' failed
make[1]: *** [CMakeFiles/tf_core_cpu.dir/all] Error 2
Makefile:127: recipe for target 'all' failed
make: *** [all] Error 2
```
This change fixes the build error above by including "cupti.h" directly in the non-Windows branch of cupti_wrapper.h.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add xla dependency to stream_executor

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Remove nccl files temporarily

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Link the gomp library on non-Windows GPU builds (needed by libcusolver.so)

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix cmake issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/cmake/CMakeLists.txt              |  4 ++++
 tensorflow/contrib/cmake/tf_core_kernels.cmake       | 10 ++++++++++
 tensorflow/contrib/cmake/tf_stream_executor.cmake    |  2 ++
 tensorflow/core/platform/default/gpu/cupti_wrapper.h |  2 +-
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 5f38a8e5c7..d75b1b12a6 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -471,6 +471,10 @@ if (tensorflow_ENABLE_GPU)
   include_directories(${tensorflow_source_dir}/third_party/gpus)
   # add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
   list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
+  if(NOT WIN32)
+    # add gomp to tensorflow_EXTERNAL_LIBRARIES, needed by libcusolver.so
+    list(APPEND tensorflow_EXTERNAL_LIBRARIES gomp)
+  endif()
 
   # NOTE(mrry): Update these flags when the version of CUDA or cuDNN used
   # in the default build is upgraded.
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 376496b33f..f38c9e0513 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -177,6 +177,16 @@ if(WIN32)
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
   )
   list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs})
+else(WIN32)
+  if(tensorflow_ENABLE_GPU)
+    file(GLOB_RECURSE tf_core_kernels_gpu_exclude_srcs
+        # temporarily disable nccl as it needs to be ported with gpu
+        "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
+    )
+    list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_gpu_exclude_srcs})
+  endif(tensorflow_ENABLE_GPU)
 endif(WIN32)
 
 file(GLOB_RECURSE tf_core_gpu_kernels_srcs
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index af48ef1fd4..9a37b68119 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -64,6 +64,8 @@ file(GLOB tf_stream_executor_srcs
 if (tensorflow_ENABLE_GPU)
     file(GLOB tf_stream_executor_gpu_srcs
         "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
+        "${tensorflow_source_dir}/tensorflow/compiler/xla/statusor.h"
+        "${tensorflow_source_dir}/tensorflow/compiler/xla/statusor.cc"
     )
     if (NOT tensorflow_BUILD_CC_TESTS)
         file(GLOB tf_stream_executor_gpu_tests
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.h b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
index acd889e474..e3ebe6ca1d 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.h
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
@@ -23,7 +23,7 @@ limitations under the License.
 #if defined(WIN32)
 #include "extras/CUPTI/include/cupti.h"
 #else
-#include "cuda/extras/CUPTI/include/cupti.h"
+#include "cupti.h"
 #endif
 namespace perftools {
 namespace gputools {
-- 
GitLab